# gaia-enhanced-agent/utils/intelligent_question_analyzer.py
# GAIA Agent Deployment
# Deploy Complete Enhanced GAIA Agent with Phase 1-6 Improvements
# 9a6a4dc
"""
Intelligent Question Analysis System
This module provides sophisticated question understanding capabilities that go beyond
hardcoded patterns to dynamically analyze what format of answer is expected.
Key Features:
1. Semantic question analysis using NLP techniques
2. Dynamic format requirement detection
3. Context-aware answer formatting rules
4. Flexible and extensible for any question type
Author: GAIA Enhanced Intelligence System
"""
import re
import logging
from typing import Dict, Any, List, Tuple, Optional, Set
from dataclasses import dataclass
from enum import Enum
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class QuestionIntent(Enum):
    """Coarse category describing what a question is asking for."""

    # "How many", "count", "number of"
    COUNT = "count"
    # "What is", "who is", "which"
    IDENTIFY = "identify"
    # "List all", "name all", "enumerate"
    LIST = "list"
    # Pull out a specific piece of information
    EXTRACT = "extract"
    # Compare, difference, similarity
    COMPARE = "compare"
    # Mathematical operations
    CALCULATE = "calculate"
    # Describe, explain
    DESCRIBE = "describe"
    # Categorize, "type of"
    CLASSIFY = "classify"
    # "Where", location
    LOCATE = "locate"
    # "When", time-related
    TEMPORAL = "temporal"
    # Fallback member
    UNKNOWN = "unknown"
class AnswerFormat(Enum):
    """Shape the final answer is expected to take, as derived from the question."""

    # --- numeric -----------------------------------------------------------
    # Pure numeric: "42", "3.14"
    NUMBER = "number"

    # --- lists -------------------------------------------------------------
    # Sorted list: "apple, banana, cherry"
    LIST_ALPHABETICAL = "list_alpha"
    # Time-ordered list
    LIST_CHRONOLOGICAL = "list_chrono"
    # Number-ordered list
    LIST_NUMERICAL = "list_numeric"

    # --- names -------------------------------------------------------------
    # Full names: "John Smith, Jane Doe"
    NAME_FULL = "name_full"
    # First names only: "John, Jane"
    NAME_FIRST = "name_first"
    # Last names only: "Smith, Doe"
    NAME_LAST = "name_last"
    # Initials: "J.S., J.D."
    NAME_INITIALS = "name_initials"

    # --- free text ---------------------------------------------------------
    # Brief text answer
    TEXT_CONCISE = "text_concise"
    # Detailed explanation
    TEXT_DETAILED = "text_detailed"

    # --- other scalar formats ----------------------------------------------
    # Yes/No
    BOOLEAN = "boolean"
    # Date format
    DATE = "date"
    # Percentage value
    PERCENTAGE = "percentage"
    # Money amount
    CURRENCY = "currency"
@dataclass
class QuestionAnalysis:
    """Result record bundling everything the analyzer derived from one question."""

    # Detected high-level intent of the question.
    intent: QuestionIntent
    # Answer format the question appears to call for.
    expected_format: AnswerFormat
    # Score attached to the analysis by the analyzer.
    confidence: float
    # Entities pulled out of the question text.
    key_entities: List[str]
    # Names of the formatting modifiers found in the question.
    modifiers: List[str]
    # Miscellaneous observations about the question text.
    context_clues: Dict[str, Any]
    # Concrete formatting switches derived from the fields above.
    formatting_rules: Dict[str, Any]
class IntelligentQuestionAnalyzer:
    """
    Heuristic analyzer that infers a question's intent and expected answer format.

    All decisions are made by matching regular-expression keyword patterns
    against the question text. Intent/format/modifier matching runs on a
    lower-cased copy of the question, while proper-noun detection runs on the
    original text because it depends on capitalisation.
    """

    # Interrogatives reported by ``_extract_question_words``. Also used to keep a
    # capitalised sentence opener such as "Who" out of the proper-noun results.
    _QUESTION_WORDS = ('who', 'what', 'when', 'where', 'why', 'how', 'which')

    # Keywords that hint at a subject domain (matched at the start of a word).
    _DOMAIN_KEYWORDS = {
        'sports': ['player', 'team', 'game', 'sport', 'athlete', 'coach'],
        'entertainment': ['movie', 'actor', 'director', 'film', 'show', 'series'],
        'literature': ['book', 'author', 'novel', 'writer', 'poem', 'story'],
        'science': ['experiment', 'research', 'study', 'theory', 'hypothesis'],
        'geography': ['country', 'city', 'location', 'place', 'region'],
        'history': ['year', 'century', 'period', 'era', 'historical'],
        'mathematics': ['calculate', 'equation', 'formula', 'solve', 'compute'],
    }

    def __init__(self):
        """Build the per-instance pattern tables used by the detection methods."""
        self.logger = logging.getLogger(__name__)

        # Intent detection patterns (applied to lower-cased text).
        self.INTENT_PATTERNS = {
            QuestionIntent.COUNT: [
                r'\bhow many\b', r'\bcount\b', r'\bnumber of\b', r'\bhow much\b',
                r'\bquantity\b', r'\btotal\b', r'\bsum\b'
            ],
            QuestionIntent.IDENTIFY: [
                r'\bwhat is\b', r'\bwho is\b', r'\bwhich\b', r'\bwhat are\b',
                r'\bidentify\b', r'\bname the\b', r'\btell me\b'
            ],
            QuestionIntent.LIST: [
                r'\blist\b', r'\bname all\b', r'\benumerate\b', r'\bmention all\b',
                r'\bprovide.*list\b', r'\bgive.*examples\b', r'\bwhat are all\b'
            ],
            QuestionIntent.EXTRACT: [
                r'\bextract\b', r'\bfind\b', r'\bget\b', r'\bretrieve\b',
                r'\bshow me\b', r'\bgive me\b'
            ],
            QuestionIntent.CALCULATE: [
                r'\bcalculate\b', r'\bcompute\b', r'\bsolve\b', r'\bfind the value\b',
                # An operator with an operand on each side, spaces optional, so
                # both "2+3" and "2 + 3" match. A trailing \b after the operator
                # would reject the spaced form. NOTE: hyphenated words still
                # satisfy the subtraction pattern.
                r'\bwhat is\b.*[\w)]\s*\+\s*[\w(]',
                r'\bwhat is\b.*[\w)]\s*-\s*[\w(]',
                r'\bwhat is\b.*[\w)]\s*\*\s*[\w(]'
            ],
            QuestionIntent.LOCATE: [
                r'\bwhere\b', r'\blocation\b', r'\bposition\b', r'\bplace\b'
            ],
            QuestionIntent.TEMPORAL: [
                r'\bwhen\b', r'\btime\b', r'\bdate\b', r'\byear\b', r'\bperiod\b'
            ]
        }

        # Format detection patterns (applied to lower-cased text).
        self.FORMAT_PATTERNS = {
            AnswerFormat.NUMBER: [
                r'\bhow many\b', r'\bcount\b', r'\bnumber\b', r'\bquantity\b',
                r'\bhow much\b', r'\btotal\b', r'\bsum\b', r'\btemperature\b',
                r'\bwhat is the temperature\b', r'\bwhat.*temperature\b'
            ],
            AnswerFormat.NAME_LAST: [
                r'\blast name\b', r'\bsurname\b', r'\bfamily name\b',
                r'\blast names of\b', r'\bsurnames of\b', r'\blast names\b',
                r'\bwhat are the last names\b', r'\bthe last names of\b',
                r'\bwho are the authors\b', r'\bwho are the\b.*\bauthors\b'
            ],
            AnswerFormat.NAME_FIRST: [
                r'\bfirst name\b', r'\bgiven name\b', r'\bfirst names of\b',
                r'\bgiven names of\b'
            ],
            AnswerFormat.NAME_FULL: [
                r'\bfull name\b', r'\bcomplete name\b', r'\bwho\b', r'\bactor\b',
                r'\bauthor\b', r'\bwriter\b', r'\bdirector\b'
            ],
            AnswerFormat.LIST_ALPHABETICAL: [
                r'\blist\b', r'\bname all\b', r'\benumerate\b', r'\bwhat are\b',
                r'\blist.*alphabetical\b', r'\balphabetical.*order\b', r'\bin alphabetical order\b'
            ],
            AnswerFormat.PERCENTAGE: [
                # '%' is not a word character, so wrapping it in \b would
                # require a word character on both sides and miss "50%".
                r'\bpercentage\b', r'\bpercent\b', r'%', r'\brate\b'
            ],
            AnswerFormat.BOOLEAN: [
                r'\bis it\b', r'\bcan\b', r'\bdoes\b', r'\bwill\b', r'\btrue or false\b'
            ]
        }

        # Context modifiers that affect formatting.
        self.CONTEXT_MODIFIERS = {
            'alphabetical': [r'\balphabetical\b', r'\bsorted\b', r'\bordered\b'],
            'chronological': [r'\bchronological\b', r'\btime order\b', r'\bsequence\b'],
            'numerical': [r'\bnumerical\b', r'\bnumber order\b'],
            'concise': [r'\bbrief\b', r'\bshort\b', r'\bconcise\b', r'\bsimple\b'],
            'detailed': [r'\bdetailed\b', r'\bexplain\b', r'\bdescribe\b', r'\belaborate\b'],
            'only': [r'\bonly\b', r'\bjust\b', r'\bmerely\b'],
            'all': [r'\ball\b', r'\bevery\b', r'\beach\b']
        }

    def analyze_question(self, question: str) -> QuestionAnalysis:
        """
        Perform comprehensive analysis of a question to determine expected answer format.

        Args:
            question: The question to analyze

        Returns:
            QuestionAnalysis with intent, format, and formatting rules
        """
        original = question.strip()
        q_lower = original.lower()

        intent = self._detect_intent(q_lower)
        expected_format = self._detect_format(q_lower, intent)

        # Entity and context analysis get the original-case text: proper-noun
        # detection relies on capital letters, which q_lower no longer has.
        key_entities = self._extract_entities(original)
        modifiers = self._extract_modifiers(q_lower)
        context_clues = self._analyze_context(original, intent, expected_format)

        formatting_rules = self._generate_formatting_rules(
            intent, expected_format, modifiers, context_clues
        )
        confidence = self._calculate_confidence(intent, expected_format, modifiers)

        return QuestionAnalysis(
            intent=intent,
            expected_format=expected_format,
            confidence=confidence,
            key_entities=key_entities,
            modifiers=modifiers,
            context_clues=context_clues,
            formatting_rules=formatting_rules
        )

    @staticmethod
    def _score_patterns(text: str, pattern_table: Dict[Any, List[str]]) -> Dict[Any, int]:
        """Count, for each key of ``pattern_table``, how many of its patterns match ``text``."""
        return {
            label: sum(1 for pattern in patterns if re.search(pattern, text))
            for label, patterns in pattern_table.items()
        }

    def _detect_intent(self, question: str) -> QuestionIntent:
        """
        Detect the primary intent of the question.

        Args:
            question: Lower-cased question text (the patterns are lower-case).

        Returns:
            The intent with the most matching patterns; ties go to the intent
            listed first in ``INTENT_PATTERNS``. ``UNKNOWN`` if nothing matches.
        """
        scores = self._score_patterns(question, self.INTENT_PATTERNS)
        if not scores or max(scores.values()) == 0:
            return QuestionIntent.UNKNOWN
        return max(scores, key=scores.get)

    def _detect_format(self, question: str, intent: QuestionIntent) -> AnswerFormat:
        """
        Detect expected answer format based on question and intent.

        Args:
            question: Lower-cased question text (the patterns are lower-case).
            intent: Intent previously detected for the same question.

        Returns:
            The best-scoring format, or ``TEXT_CONCISE`` when nothing scores.
        """
        scores = self._score_patterns(question, self.FORMAT_PATTERNS)

        # Intent-based preference: the matching format gets a +2 head start.
        if intent == QuestionIntent.COUNT:
            boosted = AnswerFormat.NUMBER
        elif intent == QuestionIntent.LIST:
            boosted = AnswerFormat.LIST_ALPHABETICAL
        elif intent == QuestionIntent.IDENTIFY and re.search(
            # Whole words only, so "factor" or "whole" no longer count as
            # "actor" / "who".
            r'\b(?:who|authors?|actors?)\b', question
        ):
            boosted = AnswerFormat.NAME_FULL
        else:
            boosted = None

        if boosted is not None:
            scores[boosted] = scores.get(boosted, 0) + 2

        if not scores or max(scores.values()) == 0:
            return AnswerFormat.TEXT_CONCISE
        return max(scores, key=scores.get)

    def _find_proper_nouns(self, text: str) -> List[str]:
        """Return runs of capitalised words in ``text``, minus lone question words."""
        candidates = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
        # A question normally opens with a capitalised interrogative ("Who ...");
        # that is not an entity, so drop it.
        return [c for c in candidates if c.lower() not in self._QUESTION_WORDS]

    def _extract_entities(self, question: str) -> List[str]:
        """
        Extract key entities from the question.

        Args:
            question: Question text; original capitalisation is needed for
                proper nouns to be found.

        Returns:
            De-duplicated entities in first-seen order: proper nouns, then
            standalone integers, then generic entity-type words.
        """
        found = self._find_proper_nouns(question)
        found.extend(re.findall(r'\b\d+\b', question))
        # Entity-type keywords are lower-case, so match them on a lowered copy.
        found.extend(re.findall(
            r'\b(?:movie|book|song|album|company|country|city)\b', question.lower()
        ))
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return list(dict.fromkeys(found))

    def _extract_modifiers(self, question: str) -> List[str]:
        """
        Extract modifiers that affect answer formatting.

        Args:
            question: Lower-cased question text (the patterns are lower-case).

        Returns:
            Names of every modifier with at least one matching pattern, in
            ``CONTEXT_MODIFIERS`` order.
        """
        return [
            name
            for name, patterns in self.CONTEXT_MODIFIERS.items()
            if any(re.search(pattern, question) for pattern in patterns)
        ]

    def _analyze_context(self, question: str, intent: QuestionIntent,
                         expected_format: AnswerFormat) -> Dict[str, Any]:
        """
        Analyze contextual clues in the question.

        Args:
            question: Question text; original capitalisation is needed for
                ``has_proper_nouns`` to be meaningful.
            intent: Detected intent (currently not consulted).
            expected_format: Detected format (currently not consulted).

        Returns:
            Dict with ``question_length``, ``has_numbers``, ``has_proper_nouns``,
            ``question_words`` and ``domain_hints``.
        """
        lowered = question.lower()
        return {
            'question_length': len(question),
            'has_numbers': bool(re.search(r'\d+', question)),
            'has_proper_nouns': bool(self._find_proper_nouns(question)),
            'question_words': self._extract_question_words(lowered),
            'domain_hints': self._detect_domain(lowered)
        }

    def _extract_question_words(self, question: str) -> List[str]:
        """Extract question words (who, what, when, where, why, how, which), case-insensitively."""
        return [
            word
            for word in self._QUESTION_WORDS
            if re.search(r'\b' + word + r'\b', question, re.IGNORECASE)
        ]

    def _detect_domain(self, question: str) -> List[str]:
        """
        Detect domain-specific hints in the question.

        A keyword counts only when it begins a word, so inflections such as
        "players" still match but "temperature" no longer hits "era" and
        "steam" no longer hits "team".

        Args:
            question: Question text (matched case-insensitively).

        Returns:
            Domain names with at least one keyword hit, in table order.
        """
        domains = []
        for domain, keywords in self._DOMAIN_KEYWORDS.items():
            starts_word = r'\b(?:' + '|'.join(re.escape(k) for k in keywords) + r')'
            if re.search(starts_word, question, re.IGNORECASE):
                domains.append(domain)
        return domains

    def _generate_formatting_rules(self, intent: QuestionIntent,
                                   expected_format: AnswerFormat,
                                   modifiers: List[str],
                                   context: Dict[str, Any]) -> Dict[str, Any]:
        """
        Generate specific formatting rules based on analysis.

        Args:
            intent: Detected intent (currently not consulted).
            expected_format: Detected answer format.
            modifiers: Modifier names found in the question.
            context: Context clues (currently not consulted).

        Returns:
            Dict of formatting switches keyed by rule name.
        """
        chronological = 'chronological' in modifiers
        numerical = 'numerical' in modifiers
        preserve_order = chronological or numerical

        return {
            'extract_numbers_only': expected_format in [AnswerFormat.NUMBER, AnswerFormat.PERCENTAGE],
            # An explicitly requested chronological/numerical order wins over the
            # default alphabetical sort; otherwise the two rules would conflict.
            'alphabetize_lists': (
                expected_format == AnswerFormat.LIST_ALPHABETICAL and not preserve_order
            ),
            'chronological_order': chronological,
            'numerical_order': numerical,
            'remove_explanations': 'concise' in modifiers or expected_format == AnswerFormat.NUMBER,
            'include_details': 'detailed' in modifiers,
            'name_format': self._determine_name_format(expected_format),
            'max_length': self._determine_max_length(expected_format, modifiers),
            'case_sensitive': False,
            'preserve_order': preserve_order
        }

    def _determine_name_format(self, expected_format: AnswerFormat) -> str:
        """Map a name-type format to 'first', 'last', 'full' or 'initials' ('full' otherwise)."""
        format_map = {
            AnswerFormat.NAME_FIRST: 'first',
            AnswerFormat.NAME_LAST: 'last',
            AnswerFormat.NAME_FULL: 'full',
            AnswerFormat.NAME_INITIALS: 'initials'
        }
        return format_map.get(expected_format, 'full')

    def _determine_max_length(self, expected_format: AnswerFormat,
                              modifiers: List[str]) -> int:
        """
        Determine maximum answer length based on format and modifiers.

        Modifiers take precedence over the format: 'concise' -> 50,
        'detailed' -> 500, then NUMBER -> 20, alphabetical/chronological
        lists -> 300, anything else -> 200.
        """
        if 'concise' in modifiers:
            return 50
        if 'detailed' in modifiers:
            return 500
        if expected_format == AnswerFormat.NUMBER:
            return 20
        if expected_format in [AnswerFormat.LIST_ALPHABETICAL, AnswerFormat.LIST_CHRONOLOGICAL]:
            return 300
        return 200

    def _calculate_confidence(self, intent: QuestionIntent,
                              expected_format: AnswerFormat,
                              modifiers: List[str]) -> float:
        """
        Calculate confidence score for the analysis.

        Starts at 0.7 and adds 0.1 for each of: a known intent, a format other
        than ``TEXT_CONCISE``, and at least one modifier.

        Returns:
            A value in {0.7, 0.8, 0.9, 1.0}.
        """
        confidence = 0.7
        if intent != QuestionIntent.UNKNOWN:
            confidence += 0.1
        if expected_format != AnswerFormat.TEXT_CONCISE:
            confidence += 0.1
        if modifiers:
            confidence += 0.1
        # Binary floats make 0.7 + 0.1 come out as 0.7999999999999999; rounding
        # keeps threshold checks such as ``confidence >= 0.8`` reliable.
        return round(min(1.0, confidence), 2)
def analyze_question_intelligently(question: str) -> QuestionAnalysis:
    """
    One-shot helper: analyze a question without managing an analyzer yourself.

    A fresh IntelligentQuestionAnalyzer is created on every call.

    Args:
        question: The question to analyze

    Returns:
        QuestionAnalysis produced by IntelligentQuestionAnalyzer.analyze_question
    """
    return IntelligentQuestionAnalyzer().analyze_question(question)