# Spaces:
# Running
# Running
"""
Intelligent Question Analysis System

This module provides sophisticated question understanding capabilities that go beyond
hardcoded patterns to dynamically analyze what format of answer is expected.

Key Features:
1. Semantic question analysis using NLP techniques
2. Dynamic format requirement detection
3. Context-aware answer formatting rules
4. Flexible and extensible for any question type

Author: GAIA Enhanced Intelligence System
"""
import logging
import re
from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, List, Optional, Set, Tuple

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
class QuestionIntent(Enum):
    """High-level intents that questions can have.

    Each member's value is the lowercase string form of its name, so a
    member can be recovered from serialized data via ``QuestionIntent(value)``.
    """

    COUNT = "count"          # How many, count, number of
    IDENTIFY = "identify"    # What is, who is, which
    LIST = "list"            # List all, name all, enumerate
    EXTRACT = "extract"      # Extract specific information
    COMPARE = "compare"      # Compare, difference, similarity
    CALCULATE = "calculate"  # Mathematical operations
    DESCRIBE = "describe"    # Describe, explain
    CLASSIFY = "classify"    # Categorize, type of
    LOCATE = "locate"        # Where, location
    TEMPORAL = "temporal"    # When, time-related
    UNKNOWN = "unknown"      # No intent could be determined
class AnswerFormat(Enum):
    """Expected answer formats based on question analysis.

    The member values are short string tags; note that the list formats use
    abbreviated tags (``"list_alpha"``, ``"list_chrono"``, ``"list_numeric"``)
    rather than the lowercase member name.
    """

    NUMBER = "number"                   # Pure numeric: "42", "3.14"
    LIST_ALPHABETICAL = "list_alpha"    # Sorted list: "apple, banana, cherry"
    LIST_CHRONOLOGICAL = "list_chrono"  # Time-ordered list
    LIST_NUMERICAL = "list_numeric"     # Number-ordered list
    NAME_FULL = "name_full"             # Full names: "John Smith, Jane Doe"
    NAME_FIRST = "name_first"           # First names only: "John, Jane"
    NAME_LAST = "name_last"             # Last names only: "Smith, Doe"
    NAME_INITIALS = "name_initials"     # Initials: "J.S., J.D."
    TEXT_CONCISE = "text_concise"       # Brief text answer
    TEXT_DETAILED = "text_detailed"     # Detailed explanation
    BOOLEAN = "boolean"                 # Yes/No
    DATE = "date"                       # Date format
    PERCENTAGE = "percentage"           # Percentage value
    CURRENCY = "currency"               # Money amount
@dataclass
class QuestionAnalysis:
    """Comprehensive analysis of a question.

    A plain data record. The ``@dataclass`` decorator generates the
    ``__init__`` that accepts every field below (positionally in this order,
    or by keyword), plus ``__repr__`` and ``__eq__``.
    """

    # Primary intent detected for the question.
    intent: QuestionIntent
    # Answer format the question appears to ask for.
    expected_format: AnswerFormat
    # Heuristic confidence score for the analysis.
    confidence: float
    # Entities (names, numbers, entity-type words) found in the question.
    key_entities: List[str]
    # Names of the formatting modifiers detected (e.g. "concise", "all").
    modifiers: List[str]
    # Miscellaneous contextual facts gathered about the question.
    context_clues: Dict[str, Any]
    # Concrete formatting switches derived from the fields above.
    formatting_rules: Dict[str, Any]
class IntelligentQuestionAnalyzer:
    """
    Heuristic question analyzer that infers intent and expected answer format.

    The analysis is rule based: tables of regular expressions are matched
    against the question, each match scores one point for the intent or
    format it belongs to, and the highest score wins. Ties are resolved in
    favour of the entry that appears first in the corresponding table.
    """

    # Interrogatives reported by ``_extract_question_words``. A capitalised
    # interrogative at the start of a question is also ignored when looking
    # for proper nouns.
    _QUESTION_WORDS = ('who', 'what', 'when', 'where', 'why', 'how', 'which')

    # One or more consecutive Capitalised words. Must run on text whose
    # original casing is intact.
    _PROPER_NOUN_RE = re.compile(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b')
    _NUMBER_RE = re.compile(r'\b\d+\b')
    _ENTITY_TYPE_RE = re.compile(
        r'\b(?:movie|book|song|album|company|country|city)\b', re.IGNORECASE
    )

    # Whole-word hint that an "identify" question is asking for a person.
    _NAME_HINT_RE = re.compile(r'\b(?:who(?:m|se)?|authors?|actors?)\b')

    _DOMAIN_KEYWORDS = {
        'sports': ['player', 'team', 'game', 'sport', 'athlete', 'coach'],
        'entertainment': ['movie', 'actor', 'director', 'film', 'show', 'series'],
        'literature': ['book', 'author', 'novel', 'writer', 'poem', 'story'],
        'science': ['experiment', 'research', 'study', 'theory', 'hypothesis'],
        'geography': ['country', 'city', 'location', 'place', 'region'],
        'history': ['year', 'century', 'period', 'era', 'historical'],
        'mathematics': ['calculate', 'equation', 'formula', 'solve', 'compute'],
    }

    # Keywords are anchored at word boundaries so that e.g. "city" no longer
    # fires on "capacity" or "era" on "camera"; a few common inflection
    # suffixes are still accepted ("players", "coached", "showing").
    _DOMAIN_PATTERNS = {
        domain: re.compile(r'\b(?:' + '|'.join(keywords) + r')(?:s|es|d|ed|ing)?\b')
        for domain, keywords in _DOMAIN_KEYWORDS.items()
    }

    def __init__(self):
        """Build the pattern tables used for intent, format and modifier detection."""
        self.logger = logging.getLogger(__name__)

        # Intent detection patterns. Order matters: on equal scores the
        # earlier entry wins, so CALCULATE is listed before the generic
        # "what is" patterns of IDENTIFY.
        self.INTENT_PATTERNS = {
            QuestionIntent.COUNT: [
                r'\bhow many\b', r'\bcount\b', r'\bnumber of\b', r'\bhow much\b',
                r'\bquantity\b', r'\btotal\b', r'\bsum\b'
            ],
            QuestionIntent.CALCULATE: [
                r'\bcalculate\b', r'\bcompute\b', r'\bsolve\b', r'\bfind the value\b',
                # Arithmetic after "what is". Operands may be separated from
                # the operator by spaces. Minus requires digits on both sides
                # so hyphenated words ("well-known") are not treated as math.
                r'\bwhat is\b.*\w\s*\+\s*\w',
                r'\bwhat is\b.*\d\s*-\s*\d',
                r'\bwhat is\b.*\w\s*\*\s*\w'
            ],
            QuestionIntent.IDENTIFY: [
                r'\bwhat is\b', r'\bwho is\b', r'\bwhich\b', r'\bwhat are\b',
                r'\bidentify\b', r'\bname the\b', r'\btell me\b'
            ],
            QuestionIntent.LIST: [
                r'\blist\b', r'\bname all\b', r'\benumerate\b', r'\bmention all\b',
                r'\bprovide.*list\b', r'\bgive.*examples\b', r'\bwhat are all\b'
            ],
            QuestionIntent.EXTRACT: [
                r'\bextract\b', r'\bfind\b', r'\bget\b', r'\bretrieve\b',
                r'\bshow me\b', r'\bgive me\b'
            ],
            QuestionIntent.LOCATE: [
                r'\bwhere\b', r'\blocation\b', r'\bposition\b', r'\bplace\b'
            ],
            QuestionIntent.TEMPORAL: [
                r'\bwhen\b', r'\btime\b', r'\bdate\b', r'\byear\b', r'\bperiod\b'
            ],
            # The remaining intents come last so they never win a tie against
            # the entries above.
            QuestionIntent.COMPARE: [
                r'\bcompare\b', r'\bdifference\b', r'\bsimilarity\b', r'\bversus\b'
            ],
            QuestionIntent.DESCRIBE: [
                r'\bdescribe\b', r'\bexplain\b'
            ],
            QuestionIntent.CLASSIFY: [
                r'\bclassify\b', r'\bcategorize\b', r'\btype of\b'
            ]
        }

        # Format detection patterns
        self.FORMAT_PATTERNS = {
            AnswerFormat.NUMBER: [
                r'\bhow many\b', r'\bcount\b', r'\bnumber\b', r'\bquantity\b',
                r'\bhow much\b', r'\btotal\b', r'\bsum\b', r'\btemperature\b',
                r'\bwhat is the temperature\b', r'\bwhat.*temperature\b'
            ],
            AnswerFormat.NAME_LAST: [
                r'\blast name\b', r'\bsurname\b', r'\bfamily name\b',
                r'\blast names of\b', r'\bsurnames of\b', r'\blast names\b',
                r'\bwhat are the last names\b', r'\bthe last names of\b',
                r'\bwho are the authors\b', r'\bwho are the\b.*\bauthors\b'
            ],
            AnswerFormat.NAME_FIRST: [
                r'\bfirst name\b', r'\bgiven name\b', r'\bfirst names of\b',
                r'\bgiven names of\b'
            ],
            AnswerFormat.NAME_FULL: [
                r'\bfull name\b', r'\bcomplete name\b', r'\bwho\b', r'\bactor\b',
                r'\bauthor\b', r'\bwriter\b', r'\bdirector\b'
            ],
            AnswerFormat.LIST_ALPHABETICAL: [
                r'\blist\b', r'\bname all\b', r'\benumerate\b', r'\bwhat are\b',
                r'\blist.*alphabetical\b', r'\balphabetical.*order\b', r'\bin alphabetical order\b'
            ],
            AnswerFormat.PERCENTAGE: [
                # '%' is not a word character, so it must not be wrapped in
                # \b...\b (that form can never match "50%" or "% of").
                r'\bpercentage\b', r'\bpercent\b', r'%', r'\brate\b'
            ],
            AnswerFormat.BOOLEAN: [
                r'\bis it\b', r'\bcan\b', r'\bdoes\b', r'\bwill\b', r'\btrue or false\b'
            ]
        }

        # Context modifiers that affect formatting
        self.CONTEXT_MODIFIERS = {
            'alphabetical': [r'\balphabetical\b', r'\bsorted\b', r'\bordered\b'],
            'chronological': [r'\bchronological\b', r'\btime order\b', r'\bsequence\b'],
            'numerical': [r'\bnumerical\b', r'\bnumber order\b'],
            'concise': [r'\bbrief\b', r'\bshort\b', r'\bconcise\b', r'\bsimple\b'],
            'detailed': [r'\bdetailed\b', r'\bexplain\b', r'\bdescribe\b', r'\belaborate\b'],
            'only': [r'\bonly\b', r'\bjust\b', r'\bmerely\b'],
            'all': [r'\ball\b', r'\bevery\b', r'\beach\b']
        }

    def analyze_question(self, question: str) -> QuestionAnalysis:
        """
        Perform comprehensive analysis of a question to determine expected answer format.

        Args:
            question: The question to analyze. ``None`` is treated as an
                empty question.

        Returns:
            QuestionAnalysis with intent, format, and formatting rules
        """
        text = (question or "").strip()
        # Keyword tables are lowercase, so they are matched against a
        # lowercased copy; entity and proper-noun detection needs the
        # original casing and therefore receives ``text``.
        lowered = text.lower()

        intent = self._detect_intent(lowered)
        expected_format = self._detect_format(lowered, intent)

        key_entities = self._extract_entities(text)
        modifiers = self._extract_modifiers(lowered)

        context_clues = self._analyze_context(text, intent, expected_format)

        formatting_rules = self._generate_formatting_rules(
            intent, expected_format, modifiers, context_clues
        )

        confidence = self._calculate_confidence(intent, expected_format, modifiers)

        return QuestionAnalysis(
            intent=intent,
            expected_format=expected_format,
            confidence=confidence,
            key_entities=key_entities,
            modifiers=modifiers,
            context_clues=context_clues,
            formatting_rules=formatting_rules
        )

    @staticmethod
    def _count_matches(patterns: List[str], text: str) -> int:
        """Return how many of ``patterns`` match somewhere in ``text``."""
        return sum(1 for pattern in patterns if re.search(pattern, text))

    def _find_proper_nouns(self, question: str) -> List[str]:
        """Return capitalised word runs in ``question`` (original casing required).

        A leading interrogative ("Who", "Which", ...) is dropped from a run
        because it is capitalised only for being sentence-initial. Other
        sentence-initial words (e.g. "List") are still reported.
        """
        nouns = []
        for match in self._PROPER_NOUN_RE.findall(question):
            words = match.split()
            if words[0].lower() in self._QUESTION_WORDS:
                words = words[1:]
            if words:
                nouns.append(' '.join(words))
        return nouns

    def _detect_intent(self, question: str) -> QuestionIntent:
        """Detect the primary intent of the question.

        Returns ``QuestionIntent.UNKNOWN`` when no intent pattern matches;
        otherwise the intent with the most matching patterns (first table
        entry wins ties).
        """
        lowered = question.lower()
        intent_scores = {
            intent: self._count_matches(patterns, lowered)
            for intent, patterns in self.INTENT_PATTERNS.items()
        }
        if not intent_scores or max(intent_scores.values()) == 0:
            return QuestionIntent.UNKNOWN
        return max(intent_scores, key=intent_scores.get)

    def _detect_format(self, question: str, intent: QuestionIntent) -> AnswerFormat:
        """Detect expected answer format based on question and intent.

        Pattern matches score one point each; the detected intent adds a
        two-point bonus to its natural format. Falls back to
        ``AnswerFormat.TEXT_CONCISE`` when nothing scores.
        """
        lowered = question.lower()
        format_scores = {
            format_type: self._count_matches(patterns, lowered)
            for format_type, patterns in self.FORMAT_PATTERNS.items()
        }

        # Apply intent-based format preferences
        if intent == QuestionIntent.COUNT:
            format_scores[AnswerFormat.NUMBER] = format_scores.get(AnswerFormat.NUMBER, 0) + 2
        elif intent == QuestionIntent.LIST:
            format_scores[AnswerFormat.LIST_ALPHABETICAL] = format_scores.get(AnswerFormat.LIST_ALPHABETICAL, 0) + 2
        elif intent == QuestionIntent.IDENTIFY and self._NAME_HINT_RE.search(lowered):
            # Whole-word test: a plain substring check would also fire on
            # "whole" (who) or "factor" (actor).
            format_scores[AnswerFormat.NAME_FULL] = format_scores.get(AnswerFormat.NAME_FULL, 0) + 2

        if not format_scores or max(format_scores.values()) == 0:
            return AnswerFormat.TEXT_CONCISE
        return max(format_scores, key=format_scores.get)

    def _extract_entities(self, question: str) -> List[str]:
        """Extract key entities from the question.

        Args:
            question: The question with its original casing (proper nouns
                cannot be recognised in lowercased text).

        Returns:
            De-duplicated entities in first-seen order: proper nouns, then
            integer tokens, then lowercase entity-type words such as "movie".
        """
        entities = self._find_proper_nouns(question)
        entities.extend(self._NUMBER_RE.findall(question))
        entities.extend(word.lower() for word in self._ENTITY_TYPE_RE.findall(question))
        # dict.fromkeys removes duplicates while keeping a deterministic order.
        return list(dict.fromkeys(entities))

    def _extract_modifiers(self, question: str) -> List[str]:
        """Extract modifiers that affect answer formatting.

        Returns the names of all modifier groups with at least one matching
        pattern, in table order.
        """
        lowered = question.lower()
        return [
            modifier
            for modifier, patterns in self.CONTEXT_MODIFIERS.items()
            if any(re.search(pattern, lowered) for pattern in patterns)
        ]

    def _analyze_context(self, question: str, intent: QuestionIntent,
                         expected_format: AnswerFormat) -> Dict[str, Any]:
        """Analyze contextual clues in the question.

        Args:
            question: The question with its original casing.
            intent: Detected intent (currently unused).
            expected_format: Detected format (currently unused).

        Returns:
            Dict with keys ``question_length``, ``has_numbers``,
            ``has_proper_nouns``, ``question_words`` and ``domain_hints``.
        """
        lowered = question.lower()
        context = {
            'question_length': len(question),
            'has_numbers': bool(re.search(r'\d+', question)),
            'has_proper_nouns': bool(self._find_proper_nouns(question)),
            'question_words': self._extract_question_words(lowered),
            'domain_hints': self._detect_domain(lowered)
        }
        return context

    def _extract_question_words(self, question: str) -> List[str]:
        """Extract question words (who, what, when, where, why, how, which)."""
        lowered = question.lower()
        return [
            word for word in self._QUESTION_WORDS
            if re.search(r'\b' + word + r'\b', lowered)
        ]

    def _detect_domain(self, question: str) -> List[str]:
        """Detect domain-specific hints in the question.

        Returns the names of every domain with a keyword that occurs as a
        whole word (optionally inflected) in the question.
        """
        lowered = question.lower()
        return [
            domain
            for domain, pattern in self._DOMAIN_PATTERNS.items()
            if pattern.search(lowered)
        ]

    def _generate_formatting_rules(self, intent: QuestionIntent,
                                   expected_format: AnswerFormat,
                                   modifiers: List[str],
                                   context: Dict[str, Any]) -> Dict[str, Any]:
        """Generate specific formatting rules based on analysis.

        ``intent`` and ``context`` are accepted for interface stability but
        are not consulted.
        """
        # An explicitly requested chronological/numerical order must survive
        # formatting, so it overrides the default alphabetical list sort.
        keep_given_order = 'chronological' in modifiers or 'numerical' in modifiers
        rules = {
            'extract_numbers_only': expected_format in [AnswerFormat.NUMBER, AnswerFormat.PERCENTAGE],
            'alphabetize_lists': (
                expected_format == AnswerFormat.LIST_ALPHABETICAL and not keep_given_order
            ),
            'chronological_order': 'chronological' in modifiers,
            'numerical_order': 'numerical' in modifiers,
            'remove_explanations': 'concise' in modifiers or expected_format == AnswerFormat.NUMBER,
            'include_details': 'detailed' in modifiers,
            'name_format': self._determine_name_format(expected_format),
            'max_length': self._determine_max_length(expected_format, modifiers),
            'case_sensitive': False,
            'preserve_order': keep_given_order
        }
        return rules

    def _determine_name_format(self, expected_format: AnswerFormat) -> str:
        """Determine specific name formatting requirements.

        Returns 'first', 'last', 'full' or 'initials'; any non-name format
        maps to 'full'.
        """
        format_map = {
            AnswerFormat.NAME_FIRST: 'first',
            AnswerFormat.NAME_LAST: 'last',
            AnswerFormat.NAME_FULL: 'full',
            AnswerFormat.NAME_INITIALS: 'initials'
        }
        return format_map.get(expected_format, 'full')

    def _determine_max_length(self, expected_format: AnswerFormat,
                              modifiers: List[str]) -> int:
        """Determine maximum answer length based on format and modifiers.

        Modifiers take precedence over the format: 'concise' gives 50 and
        'detailed' gives 500 regardless of ``expected_format``.
        """
        if 'concise' in modifiers:
            return 50
        if 'detailed' in modifiers:
            return 500
        if expected_format == AnswerFormat.NUMBER:
            return 20
        if expected_format in [AnswerFormat.LIST_ALPHABETICAL, AnswerFormat.LIST_CHRONOLOGICAL]:
            return 300
        return 200

    def _calculate_confidence(self, intent: QuestionIntent,
                              expected_format: AnswerFormat,
                              modifiers: List[str]) -> float:
        """Calculate confidence score for the analysis.

        Starts at 0.7 and adds 0.1 each for a known intent, a format other
        than the TEXT_CONCISE fallback, and the presence of any modifier.
        """
        confidence = 0.7
        # Boost confidence for clear patterns
        if intent != QuestionIntent.UNKNOWN:
            confidence += 0.1
        if expected_format != AnswerFormat.TEXT_CONCISE:
            confidence += 0.1
        if modifiers:
            confidence += 0.1
        # Rounding removes binary floating-point noise (0.7 + 0.1 would
        # otherwise come out as 0.7999999999999999).
        return round(min(1.0, confidence), 2)
def analyze_question_intelligently(question: str) -> QuestionAnalysis:
    """
    Convenience function for intelligent question analysis.

    Creates a fresh ``IntelligentQuestionAnalyzer`` on every call and
    delegates to its ``analyze_question`` method.

    Args:
        question: The question to analyze

    Returns:
        QuestionAnalysis with comprehensive formatting requirements
    """
    analyzer = IntelligentQuestionAnalyzer()
    return analyzer.analyze_question(question)