""" Intelligent Question Analysis System This module provides sophisticated question understanding capabilities that go beyond hardcoded patterns to dynamically analyze what format of answer is expected. Key Features: 1. Semantic question analysis using NLP techniques 2. Dynamic format requirement detection 3. Context-aware answer formatting rules 4. Flexible and extensible for any question type Author: GAIA Enhanced Intelligence System """ import re import logging from typing import Dict, Any, List, Tuple, Optional, Set from dataclasses import dataclass from enum import Enum logger = logging.getLogger(__name__) class QuestionIntent(Enum): """High-level intents that questions can have.""" COUNT = "count" # How many, count, number of IDENTIFY = "identify" # What is, who is, which LIST = "list" # List all, name all, enumerate EXTRACT = "extract" # Extract specific information COMPARE = "compare" # Compare, difference, similarity CALCULATE = "calculate" # Mathematical operations DESCRIBE = "describe" # Describe, explain CLASSIFY = "classify" # Categorize, type of LOCATE = "locate" # Where, location TEMPORAL = "temporal" # When, time-related UNKNOWN = "unknown" class AnswerFormat(Enum): """Expected answer formats based on question analysis.""" NUMBER = "number" # Pure numeric: "42", "3.14" LIST_ALPHABETICAL = "list_alpha" # Sorted list: "apple, banana, cherry" LIST_CHRONOLOGICAL = "list_chrono" # Time-ordered list LIST_NUMERICAL = "list_numeric" # Number-ordered list NAME_FULL = "name_full" # Full names: "John Smith, Jane Doe" NAME_FIRST = "name_first" # First names only: "John, Jane" NAME_LAST = "name_last" # Last names only: "Smith, Doe" NAME_INITIALS = "name_initials" # Initials: "J.S., J.D." TEXT_CONCISE = "text_concise" # Brief text answer TEXT_DETAILED = "text_detailed" # Detailed explanation BOOLEAN = "boolean" # Yes/No DATE = "date" # Date format PERCENTAGE = "percentage" # Percentage value CURRENCY = "currency" # Money amount @dataclass class QuestionAnalysis: """Comprehensive analysis of a question.""" intent: QuestionIntent expected_format: AnswerFormat confidence: float key_entities: List[str] modifiers: List[str] context_clues: Dict[str, Any] formatting_rules: Dict[str, Any] class IntelligentQuestionAnalyzer: """ Advanced question analyzer that understands intent and format requirements using natural language processing techniques. """ def __init__(self): self.logger = logging.getLogger(__name__) # Intent detection patterns self.INTENT_PATTERNS = { QuestionIntent.COUNT: [ r'\bhow many\b', r'\bcount\b', r'\bnumber of\b', r'\bhow much\b', r'\bquantity\b', r'\btotal\b', r'\bsum\b' ], QuestionIntent.IDENTIFY: [ r'\bwhat is\b', r'\bwho is\b', r'\bwhich\b', r'\bwhat are\b', r'\bidentify\b', r'\bname the\b', r'\btell me\b' ], QuestionIntent.LIST: [ r'\blist\b', r'\bname all\b', r'\benumerate\b', r'\bmention all\b', r'\bprovide.*list\b', r'\bgive.*examples\b', r'\bwhat are all\b' ], QuestionIntent.EXTRACT: [ r'\bextract\b', r'\bfind\b', r'\bget\b', r'\bretrieve\b', r'\bshow me\b', r'\bgive me\b' ], QuestionIntent.CALCULATE: [ r'\bcalculate\b', r'\bcompute\b', r'\bsolve\b', r'\bfind the value\b', r'\bwhat is.*\+\b', r'\bwhat is.*\-\b', r'\bwhat is.*\*\b' ], QuestionIntent.LOCATE: [ r'\bwhere\b', r'\blocation\b', r'\bposition\b', r'\bplace\b' ], QuestionIntent.TEMPORAL: [ r'\bwhen\b', r'\btime\b', r'\bdate\b', r'\byear\b', r'\bperiod\b' ] } # Format detection patterns self.FORMAT_PATTERNS = { AnswerFormat.NUMBER: [ r'\bhow many\b', r'\bcount\b', r'\bnumber\b', r'\bquantity\b', r'\bhow much\b', r'\btotal\b', r'\bsum\b', r'\btemperature\b', r'\bwhat is the temperature\b', r'\bwhat.*temperature\b' ], AnswerFormat.NAME_LAST: [ r'\blast name\b', r'\bsurname\b', r'\bfamily name\b', r'\blast names of\b', r'\bsurnames of\b', r'\blast names\b', r'\bwhat are the last names\b', r'\bthe last names of\b', r'\bwho are the authors\b', r'\bwho are the\b.*\bauthors\b' ], AnswerFormat.NAME_FIRST: [ r'\bfirst name\b', r'\bgiven name\b', r'\bfirst names of\b', r'\bgiven names of\b' ], AnswerFormat.NAME_FULL: [ r'\bfull name\b', r'\bcomplete name\b', r'\bwho\b', r'\bactor\b', r'\bauthor\b', r'\bwriter\b', r'\bdirector\b' ], AnswerFormat.LIST_ALPHABETICAL: [ r'\blist\b', r'\bname all\b', r'\benumerate\b', r'\bwhat are\b', r'\blist.*alphabetical\b', r'\balphabetical.*order\b', r'\bin alphabetical order\b' ], AnswerFormat.PERCENTAGE: [ r'\bpercentage\b', r'\bpercent\b', r'\b%\b', r'\brate\b' ], AnswerFormat.BOOLEAN: [ r'\bis it\b', r'\bcan\b', r'\bdoes\b', r'\bwill\b', r'\btrue or false\b' ] } # Context modifiers that affect formatting self.CONTEXT_MODIFIERS = { 'alphabetical': [r'\balphabetical\b', r'\bsorted\b', r'\bordered\b'], 'chronological': [r'\bchronological\b', r'\btime order\b', r'\bsequence\b'], 'numerical': [r'\bnumerical\b', r'\bnumber order\b'], 'concise': [r'\bbrief\b', r'\bshort\b', r'\bconcise\b', r'\bsimple\b'], 'detailed': [r'\bdetailed\b', r'\bexplain\b', r'\bdescribe\b', r'\belaborate\b'], 'only': [r'\bonly\b', r'\bjust\b', r'\bmerely\b'], 'all': [r'\ball\b', r'\bevery\b', r'\beach\b'] } def analyze_question(self, question: str) -> QuestionAnalysis: """ Perform comprehensive analysis of a question to determine expected answer format. Args: question: The question to analyze Returns: QuestionAnalysis with intent, format, and formatting rules """ q_lower = question.lower().strip() # Detect intent intent = self._detect_intent(q_lower) # Detect expected format expected_format = self._detect_format(q_lower, intent) # Extract key entities and modifiers key_entities = self._extract_entities(q_lower) modifiers = self._extract_modifiers(q_lower) # Analyze context clues context_clues = self._analyze_context(q_lower, intent, expected_format) # Generate formatting rules formatting_rules = self._generate_formatting_rules( intent, expected_format, modifiers, context_clues ) # Calculate confidence confidence = self._calculate_confidence(intent, expected_format, modifiers) return QuestionAnalysis( intent=intent, expected_format=expected_format, confidence=confidence, key_entities=key_entities, modifiers=modifiers, context_clues=context_clues, formatting_rules=formatting_rules ) def _detect_intent(self, question: str) -> QuestionIntent: """Detect the primary intent of the question.""" intent_scores = {} for intent, patterns in self.INTENT_PATTERNS.items(): score = 0 for pattern in patterns: if re.search(pattern, question): score += 1 intent_scores[intent] = score if not intent_scores or max(intent_scores.values()) == 0: return QuestionIntent.UNKNOWN return max(intent_scores, key=intent_scores.get) def _detect_format(self, question: str, intent: QuestionIntent) -> AnswerFormat: """Detect expected answer format based on question and intent.""" format_scores = {} for format_type, patterns in self.FORMAT_PATTERNS.items(): score = 0 for pattern in patterns: if re.search(pattern, question): score += 1 format_scores[format_type] = score # Apply intent-based format preferences if intent == QuestionIntent.COUNT: format_scores[AnswerFormat.NUMBER] = format_scores.get(AnswerFormat.NUMBER, 0) + 2 elif intent == QuestionIntent.LIST: format_scores[AnswerFormat.LIST_ALPHABETICAL] = format_scores.get(AnswerFormat.LIST_ALPHABETICAL, 0) + 2 elif intent == QuestionIntent.IDENTIFY and any(word in question for word in ['who', 'author', 'actor']): format_scores[AnswerFormat.NAME_FULL] = format_scores.get(AnswerFormat.NAME_FULL, 0) + 2 if not format_scores or max(format_scores.values()) == 0: return AnswerFormat.TEXT_CONCISE return max(format_scores, key=format_scores.get) def _extract_entities(self, question: str) -> List[str]: """Extract key entities from the question.""" entities = [] # Common entity patterns entity_patterns = [ r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', # Proper nouns r'\b\d+\b', # Numbers r'\b(?:movie|book|song|album|company|country|city)\b' # Common entity types ] for pattern in entity_patterns: matches = re.findall(pattern, question) entities.extend(matches) return list(set(entities)) def _extract_modifiers(self, question: str) -> List[str]: """Extract modifiers that affect answer formatting.""" modifiers = [] for modifier, patterns in self.CONTEXT_MODIFIERS.items(): for pattern in patterns: if re.search(pattern, question): modifiers.append(modifier) break return modifiers def _analyze_context(self, question: str, intent: QuestionIntent, expected_format: AnswerFormat) -> Dict[str, Any]: """Analyze contextual clues in the question.""" context = { 'question_length': len(question), 'has_numbers': bool(re.search(r'\d+', question)), 'has_proper_nouns': bool(re.search(r'\b[A-Z][a-z]+\b', question)), 'question_words': self._extract_question_words(question), 'domain_hints': self._detect_domain(question) } return context def _extract_question_words(self, question: str) -> List[str]: """Extract question words (who, what, when, where, why, how).""" question_words = [] patterns = [r'\bwho\b', r'\bwhat\b', r'\bwhen\b', r'\bwhere\b', r'\bwhy\b', r'\bhow\b', r'\bwhich\b'] for pattern in patterns: if re.search(pattern, question): question_words.append(pattern.strip('\\b')) return question_words def _detect_domain(self, question: str) -> List[str]: """Detect domain-specific hints in the question.""" domains = [] domain_keywords = { 'sports': ['player', 'team', 'game', 'sport', 'athlete', 'coach'], 'entertainment': ['movie', 'actor', 'director', 'film', 'show', 'series'], 'literature': ['book', 'author', 'novel', 'writer', 'poem', 'story'], 'science': ['experiment', 'research', 'study', 'theory', 'hypothesis'], 'geography': ['country', 'city', 'location', 'place', 'region'], 'history': ['year', 'century', 'period', 'era', 'historical'], 'mathematics': ['calculate', 'equation', 'formula', 'solve', 'compute'] } for domain, keywords in domain_keywords.items(): if any(keyword in question for keyword in keywords): domains.append(domain) return domains def _generate_formatting_rules(self, intent: QuestionIntent, expected_format: AnswerFormat, modifiers: List[str], context: Dict[str, Any]) -> Dict[str, Any]: """Generate specific formatting rules based on analysis.""" rules = { 'extract_numbers_only': expected_format in [AnswerFormat.NUMBER, AnswerFormat.PERCENTAGE], 'alphabetize_lists': expected_format in [AnswerFormat.LIST_ALPHABETICAL], 'chronological_order': 'chronological' in modifiers, 'numerical_order': 'numerical' in modifiers, 'remove_explanations': 'concise' in modifiers or expected_format == AnswerFormat.NUMBER, 'include_details': 'detailed' in modifiers, 'name_format': self._determine_name_format(expected_format), 'max_length': self._determine_max_length(expected_format, modifiers), 'case_sensitive': False, 'preserve_order': 'chronological' in modifiers or 'numerical' in modifiers } return rules def _determine_name_format(self, expected_format: AnswerFormat) -> str: """Determine specific name formatting requirements.""" format_map = { AnswerFormat.NAME_FIRST: 'first', AnswerFormat.NAME_LAST: 'last', AnswerFormat.NAME_FULL: 'full', AnswerFormat.NAME_INITIALS: 'initials' } return format_map.get(expected_format, 'full') def _determine_max_length(self, expected_format: AnswerFormat, modifiers: List[str]) -> int: """Determine maximum answer length based on format and modifiers.""" if 'concise' in modifiers: return 50 elif 'detailed' in modifiers: return 500 elif expected_format == AnswerFormat.NUMBER: return 20 elif expected_format in [AnswerFormat.LIST_ALPHABETICAL, AnswerFormat.LIST_CHRONOLOGICAL]: return 300 else: return 200 def _calculate_confidence(self, intent: QuestionIntent, expected_format: AnswerFormat, modifiers: List[str]) -> float: """Calculate confidence score for the analysis.""" base_confidence = 0.7 # Boost confidence for clear patterns if intent != QuestionIntent.UNKNOWN: base_confidence += 0.1 if expected_format != AnswerFormat.TEXT_CONCISE: base_confidence += 0.1 if modifiers: base_confidence += 0.1 return min(1.0, base_confidence) def analyze_question_intelligently(question: str) -> QuestionAnalysis: """ Convenience function for intelligent question analysis. Args: question: The question to analyze Returns: QuestionAnalysis with comprehensive formatting requirements """ analyzer = IntelligentQuestionAnalyzer() return analyzer.analyze_question(question)