"""
Intelligent Question Analysis System

This module provides sophisticated question understanding capabilities that go beyond
hardcoded patterns to dynamically analyze what format of answer is expected.

Key Features:
1. Semantic question analysis using NLP techniques
2. Dynamic format requirement detection
3. Context-aware answer formatting rules
4. Flexible and extensible for any question type

Author: GAIA Enhanced Intelligence System
"""

import re
import logging
from typing import Dict, Any, List, Tuple, Optional, Set
from dataclasses import dataclass
from enum import Enum

logger = logging.getLogger(__name__)


class QuestionIntent(Enum):
    """Coarse category describing what a question is asking for."""

    # Quantity questions: "how many", "count", "number of".
    COUNT = "count"
    # Identification questions: "what is", "who is", "which".
    IDENTIFY = "identify"
    # Enumeration requests: "list all", "name all", "enumerate".
    LIST = "list"
    # Requests to pull out a specific piece of information.
    EXTRACT = "extract"
    # Comparisons: differences and similarities.
    COMPARE = "compare"
    # Mathematical operations.
    CALCULATE = "calculate"
    # Requests to describe or explain something.
    DESCRIBE = "describe"
    # Categorisation: "type of", "categorize".
    CLASSIFY = "classify"
    # Location questions: "where".
    LOCATE = "locate"
    # Time-related questions: "when".
    TEMPORAL = "temporal"
    # Fallback when nothing else applies.
    UNKNOWN = "unknown"


class AnswerFormat(Enum):
    """Shape the answer to a question is expected to take."""

    # Pure numeric value, e.g. "42" or "3.14".
    NUMBER = "number"
    # Alphabetically sorted list, e.g. "apple, banana, cherry".
    LIST_ALPHABETICAL = "list_alpha"
    # List ordered by time.
    LIST_CHRONOLOGICAL = "list_chrono"
    # List ordered by number.
    LIST_NUMERICAL = "list_numeric"
    # Full names, e.g. "John Smith, Jane Doe".
    NAME_FULL = "name_full"
    # First names only, e.g. "John, Jane".
    NAME_FIRST = "name_first"
    # Last names only, e.g. "Smith, Doe".
    NAME_LAST = "name_last"
    # Initials, e.g. "J.S., J.D.".
    NAME_INITIALS = "name_initials"
    # Brief text answer.
    TEXT_CONCISE = "text_concise"
    # Detailed explanation.
    TEXT_DETAILED = "text_detailed"
    # Yes/No answer.
    BOOLEAN = "boolean"
    # A date.
    DATE = "date"
    # A percentage value.
    PERCENTAGE = "percentage"
    # A money amount.
    CURRENCY = "currency"


@dataclass
class QuestionAnalysis:
    """Result record bundling everything the analyzer derived from one question.

    All fields are required and positional, in the order declared below.
    """
    # Primary intent chosen for the question.
    intent: QuestionIntent
    # Answer format the question is expected to require.
    expected_format: AnswerFormat
    # Heuristic confidence score; the analyzer in this file produces values
    # between 0.7 and 1.0.
    confidence: float
    # Strings matched by the analyzer's entity patterns.
    key_entities: List[str]
    # Names of the context modifiers found in the question
    # (e.g. 'concise', 'detailed', 'chronological').
    modifiers: List[str]
    # Miscellaneous observations about the question text, keyed by name
    # (e.g. 'question_length', 'has_numbers', 'domain_hints').
    context_clues: Dict[str, Any]
    # Concrete formatting switches derived from the fields above
    # (e.g. 'extract_numbers_only', 'max_length', 'name_format').
    formatting_rules: Dict[str, Any]


class IntelligentQuestionAnalyzer:
    """
    Rule-based question analyzer.

    A question is scored against tables of regular expressions to pick a
    ``QuestionIntent`` and an ``AnswerFormat``. The same text is then mined for
    entities, modifiers and context clues, from which a dictionary of
    formatting rules and a confidence score are derived.

    Keyword matching is done on a lower-cased copy of the question; proper-noun
    detection needs capitalisation and therefore works on the original text.
    """

    # Interrogatives reported by ``_extract_question_words``, in reporting order.
    _QUESTION_WORDS = ('who', 'what', 'when', 'where', 'why', 'how', 'which')

    # Words that are normally capitalised only because they open a sentence.
    # They are trimmed from the front of a sentence-initial proper-noun match
    # so that "Did Tolkien ..." yields "Tolkien" and a lone "Who" yields nothing.
    _SENTENCE_OPENERS = frozenset({
        'who', 'whom', 'whose', 'what', 'when', 'where', 'why', 'how', 'which',
        'is', 'are', 'was', 'were', 'do', 'does', 'did', 'has', 'have', 'had',
        'can', 'could', 'will', 'would', 'should',
        'list', 'name', 'find', 'get', 'give', 'show', 'tell', 'count',
        'identify', 'extract', 'retrieve', 'enumerate', 'mention', 'provide',
        'calculate', 'compute', 'solve', 'explain', 'describe', 'compare',
        'the', 'an', 'in', 'on', 'of', 'at', 'for', 'from', 'by', 'to', 'if',
        'please',
    })

    def __init__(self):
        """Build the per-instance pattern tables used by the detectors."""
        self.logger = logging.getLogger(__name__)

        # Intent detection patterns (matched against lower-cased text).
        self.INTENT_PATTERNS = {
            QuestionIntent.COUNT: [
                r'\bhow many\b', r'\bcount\b', r'\bnumber of\b', r'\bhow much\b',
                r'\bquantity\b', r'\btotal\b', r'\bsum\b'
            ],
            QuestionIntent.IDENTIFY: [
                r'\bwhat is\b', r'\bwho is\b', r'\bwhich\b', r'\bwhat are\b',
                r'\bidentify\b', r'\bname the\b', r'\btell me\b'
            ],
            QuestionIntent.LIST: [
                r'\blist\b', r'\bname all\b', r'\benumerate\b', r'\bmention all\b',
                r'\bprovide.*list\b', r'\bgive.*examples\b', r'\bwhat are all\b'
            ],
            QuestionIntent.EXTRACT: [
                r'\bextract\b', r'\bfind\b', r'\bget\b', r'\bretrieve\b',
                r'\bshow me\b', r'\bgive me\b'
            ],
            QuestionIntent.CALCULATE: [
                r'\bcalculate\b', r'\bcompute\b', r'\bsolve\b', r'\bfind the value\b',
                # Arithmetic written inline. Operators are non-word characters,
                # so a trailing \b would reject "2 + 3"; operands are matched
                # explicitly instead. Minus requires digits on both sides so
                # hyphenated words ("well-known") do not count as subtraction.
                r'\bwhat is\b.*\w\s*\+\s*\w',
                r'\bwhat is\b.*\d\s*-\s*\d',
                r'\bwhat is\b.*\w\s*\*\s*\w'
            ],
            QuestionIntent.LOCATE: [
                r'\bwhere\b', r'\blocation\b', r'\bposition\b', r'\bplace\b'
            ],
            QuestionIntent.TEMPORAL: [
                r'\bwhen\b', r'\btime\b', r'\bdate\b', r'\byear\b', r'\bperiod\b'
            ]
        }

        # Format detection patterns (matched against lower-cased text).
        self.FORMAT_PATTERNS = {
            AnswerFormat.NUMBER: [
                r'\bhow many\b', r'\bcount\b', r'\bnumber\b', r'\bquantity\b',
                r'\bhow much\b', r'\btotal\b', r'\bsum\b', r'\btemperature\b',
                r'\bwhat is the temperature\b', r'\bwhat.*temperature\b'
            ],
            AnswerFormat.NAME_LAST: [
                r'\blast name\b', r'\bsurname\b', r'\bfamily name\b',
                r'\blast names of\b', r'\bsurnames of\b', r'\blast names\b',
                r'\bwhat are the last names\b', r'\bthe last names of\b',
                r'\bwho are the authors\b', r'\bwho are the\b.*\bauthors\b'
            ],
            AnswerFormat.NAME_FIRST: [
                r'\bfirst name\b', r'\bgiven name\b', r'\bfirst names of\b',
                r'\bgiven names of\b'
            ],
            AnswerFormat.NAME_FULL: [
                r'\bfull name\b', r'\bcomplete name\b', r'\bwho\b', r'\bactor\b',
                r'\bauthor\b', r'\bwriter\b', r'\bdirector\b'
            ],
            AnswerFormat.LIST_ALPHABETICAL: [
                r'\blist\b', r'\bname all\b', r'\benumerate\b', r'\bwhat are\b',
                r'\blist.*alphabetical\b', r'\balphabetical.*order\b', r'\bin alphabetical order\b'
            ],
            AnswerFormat.PERCENTAGE: [
                # '%' is a non-word character, so it must not be wrapped in \b:
                # r'\b%\b' would only match when squeezed between word characters.
                r'\bpercentage\b', r'\bpercent\b', r'%', r'\brate\b'
            ],
            AnswerFormat.BOOLEAN: [
                r'\bis it\b', r'\bcan\b', r'\bdoes\b', r'\bwill\b', r'\btrue or false\b'
            ]
        }

        # Context modifiers that affect formatting
        self.CONTEXT_MODIFIERS = {
            'alphabetical': [r'\balphabetical\b', r'\bsorted\b', r'\bordered\b'],
            'chronological': [r'\bchronological\b', r'\btime order\b', r'\bsequence\b'],
            'numerical': [r'\bnumerical\b', r'\bnumber order\b'],
            'concise': [r'\bbrief\b', r'\bshort\b', r'\bconcise\b', r'\bsimple\b'],
            'detailed': [r'\bdetailed\b', r'\bexplain\b', r'\bdescribe\b', r'\belaborate\b'],
            'only': [r'\bonly\b', r'\bjust\b', r'\bmerely\b'],
            'all': [r'\ball\b', r'\bevery\b', r'\beach\b']
        }

    def analyze_question(self, question: str) -> "QuestionAnalysis":
        """
        Perform comprehensive analysis of a question to determine expected answer format.

        Args:
            question: The question to analyze

        Returns:
            QuestionAnalysis with intent, format, and formatting rules
        """
        original = question.strip()
        q_lower = original.lower()

        # Keyword tables are lower-case, so intent/format/modifiers use q_lower.
        intent = self._detect_intent(q_lower)
        expected_format = self._detect_format(q_lower, intent)
        modifiers = self._extract_modifiers(q_lower)

        # Proper-noun detection depends on capitalisation, so these two steps
        # must see the text as the user wrote it.
        key_entities = self._extract_entities(original)
        context_clues = self._analyze_context(original, intent, expected_format)

        formatting_rules = self._generate_formatting_rules(
            intent, expected_format, modifiers, context_clues
        )
        confidence = self._calculate_confidence(intent, expected_format, modifiers)

        return QuestionAnalysis(
            intent=intent,
            expected_format=expected_format,
            confidence=confidence,
            key_entities=key_entities,
            modifiers=modifiers,
            context_clues=context_clues,
            formatting_rules=formatting_rules
        )

    def _detect_intent(self, question: str) -> "QuestionIntent":
        """Detect the primary intent of the question.

        Each intent scores one point per matching pattern. The highest score
        wins; ties go to the intent listed first in ``INTENT_PATTERNS``.
        Returns ``QuestionIntent.UNKNOWN`` when nothing matches.

        Args:
            question: Lower-cased question text.
        """
        intent_scores = {
            intent: sum(1 for pattern in patterns if re.search(pattern, question))
            for intent, patterns in self.INTENT_PATTERNS.items()
        }

        if not intent_scores or max(intent_scores.values()) == 0:
            return QuestionIntent.UNKNOWN

        return max(intent_scores, key=intent_scores.get)

    def _detect_format(self, question: str, intent: "QuestionIntent") -> "AnswerFormat":
        """Detect expected answer format based on question and intent.

        Formats score one point per matching pattern, plus a two-point bonus
        for the format that naturally goes with the detected intent. Falls back
        to ``AnswerFormat.TEXT_CONCISE`` when nothing scores.

        Args:
            question: Lower-cased question text.
            intent: Intent previously detected for the same question.
        """
        format_scores = {
            format_type: sum(1 for pattern in patterns if re.search(pattern, question))
            for format_type, patterns in self.FORMAT_PATTERNS.items()
        }

        # Apply intent-based format preferences
        if intent == QuestionIntent.COUNT:
            format_scores[AnswerFormat.NUMBER] = format_scores.get(AnswerFormat.NUMBER, 0) + 2
        elif intent == QuestionIntent.LIST:
            format_scores[AnswerFormat.LIST_ALPHABETICAL] = format_scores.get(AnswerFormat.LIST_ALPHABETICAL, 0) + 2
        elif intent == QuestionIntent.IDENTIFY and re.search(
            # Whole words only: a substring test would fire on "whole".
            r'\b(?:who|whom|whose|authors?|actors?)\b', question
        ):
            format_scores[AnswerFormat.NAME_FULL] = format_scores.get(AnswerFormat.NAME_FULL, 0) + 2

        if not format_scores or max(format_scores.values()) == 0:
            return AnswerFormat.TEXT_CONCISE

        return max(format_scores, key=format_scores.get)

    def _find_proper_nouns(self, question: str) -> List[str]:
        """Return runs of capitalised words that look like proper nouns.

        A run is one or more consecutive words of the form ``Xxxx``. When a run
        opens a sentence, leading words from ``_SENTENCE_OPENERS`` are removed,
        because they are capitalised by position rather than by meaning. Runs
        are returned in order of appearance with inner whitespace collapsed to
        single spaces.

        Args:
            question: Question text with its original capitalisation.
        """
        found = []
        for match in re.finditer(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', question):
            words = match.group().split()
            preceding = question[:match.start()].rstrip()
            opens_sentence = not preceding or preceding[-1] in '.!?'
            if opens_sentence:
                while words and words[0].lower() in self._SENTENCE_OPENERS:
                    words.pop(0)
            if words:
                found.append(' '.join(words))
        return found

    def _extract_entities(self, question: str) -> List[str]:
        """Extract key entities from the question.

        Collects, in this order: proper nouns, standalone integers, and generic
        entity-type words (movie, book, ...; reported lower-case). Duplicates
        are dropped while keeping first-seen order, so the result is
        deterministic.

        Args:
            question: Question text; pass the original capitalisation if proper
                nouns should be detected.
        """
        entities = self._find_proper_nouns(question)
        entities.extend(re.findall(r'\b\d+\b', question))
        entities.extend(
            word.lower()
            for word in re.findall(
                r'\b(?:movie|book|song|album|company|country|city)\b',
                question,
                re.IGNORECASE,
            )
        )

        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(dict.fromkeys(entities))

    def _extract_modifiers(self, question: str) -> List[str]:
        """Extract modifiers that affect answer formatting.

        Returns the keys of ``CONTEXT_MODIFIERS`` for which at least one
        pattern matches, in table order.

        Args:
            question: Lower-cased question text.
        """
        return [
            modifier
            for modifier, patterns in self.CONTEXT_MODIFIERS.items()
            if any(re.search(pattern, question) for pattern in patterns)
        ]

    def _analyze_context(self, question: str, intent: "QuestionIntent",
                        expected_format: "AnswerFormat") -> Dict[str, Any]:
        """Analyze contextual clues in the question.

        Args:
            question: Question text with its original capitalisation.
            intent: Detected intent (currently not used by any clue).
            expected_format: Detected format (currently not used by any clue).

        Returns:
            Dict with keys ``question_length``, ``has_numbers``,
            ``has_proper_nouns``, ``question_words`` and ``domain_hints``.
        """
        lowered = question.lower()
        context = {
            'question_length': len(question),
            'has_numbers': bool(re.search(r'\d+', question)),
            'has_proper_nouns': bool(self._find_proper_nouns(question)),
            'question_words': self._extract_question_words(lowered),
            'domain_hints': self._detect_domain(lowered)
        }

        return context

    def _extract_question_words(self, question: str) -> List[str]:
        """Extract question words (who, what, when, where, why, how, which).

        Matching is whole-word and case-insensitive; words are reported
        lower-case in the fixed order of ``_QUESTION_WORDS``.
        """
        return [
            word
            for word in self._QUESTION_WORDS
            if re.search(r'\b' + word + r'\b', question, re.IGNORECASE)
        ]

    def _detect_domain(self, question: str) -> List[str]:
        """Detect domain-specific hints in the question.

        A domain is reported when any word in the question *starts with* one of
        its keywords (case-insensitive), so "players" counts for "player" but
        "temperature" does not count for "era".
        """
        domain_keywords = {
            'sports': ['player', 'team', 'game', 'sport', 'athlete', 'coach'],
            'entertainment': ['movie', 'actor', 'director', 'film', 'show', 'series'],
            'literature': ['book', 'author', 'novel', 'writer', 'poem', 'story'],
            'science': ['experiment', 'research', 'study', 'theory', 'hypothesis'],
            'geography': ['country', 'city', 'location', 'place', 'region'],
            'history': ['year', 'century', 'period', 'era', 'historical'],
            'mathematics': ['calculate', 'equation', 'formula', 'solve', 'compute']
        }

        domains = []
        for domain, keywords in domain_keywords.items():
            # Anchor at a word start only; the open end keeps plural and
            # inflected forms matching.
            pattern = r'\b(?:' + '|'.join(re.escape(keyword) for keyword in keywords) + ')'
            if re.search(pattern, question, re.IGNORECASE):
                domains.append(domain)

        return domains

    def _generate_formatting_rules(self, intent: "QuestionIntent",
                                 expected_format: "AnswerFormat",
                                 modifiers: List[str],
                                 context: Dict[str, Any]) -> Dict[str, Any]:
        """Generate specific formatting rules based on analysis.

        Args:
            intent: Detected intent (currently not used by any rule).
            expected_format: Detected answer format.
            modifiers: Modifier names found in the question.
            context: Context clues (currently not used by any rule).

        Returns:
            Dict of rule name to value; see the keys built below.
        """
        explicit_order = 'chronological' in modifiers or 'numerical' in modifiers
        # Alphabetical sorting is only the default for lists. An explicitly
        # requested chronological/numerical order must not be undone by it,
        # unless the question asked for alphabetical order as well.
        alphabetize = expected_format == AnswerFormat.LIST_ALPHABETICAL and (
            'alphabetical' in modifiers or not explicit_order
        )

        rules = {
            'extract_numbers_only': expected_format in [AnswerFormat.NUMBER, AnswerFormat.PERCENTAGE],
            'alphabetize_lists': alphabetize,
            'chronological_order': 'chronological' in modifiers,
            'numerical_order': 'numerical' in modifiers,
            'remove_explanations': 'concise' in modifiers or expected_format == AnswerFormat.NUMBER,
            'include_details': 'detailed' in modifiers,
            'name_format': self._determine_name_format(expected_format),
            'max_length': self._determine_max_length(expected_format, modifiers),
            'case_sensitive': False,
            'preserve_order': explicit_order
        }

        return rules

    def _determine_name_format(self, expected_format: "AnswerFormat") -> str:
        """Determine specific name formatting requirements.

        Returns one of 'first', 'last', 'full' or 'initials'; any format that
        is not a name format maps to 'full'.
        """
        format_map = {
            AnswerFormat.NAME_FIRST: 'first',
            AnswerFormat.NAME_LAST: 'last',
            AnswerFormat.NAME_FULL: 'full',
            AnswerFormat.NAME_INITIALS: 'initials'
        }
        return format_map.get(expected_format, 'full')

    def _determine_max_length(self, expected_format: "AnswerFormat",
                            modifiers: List[str]) -> int:
        """Determine maximum answer length based on format and modifiers.

        Modifiers take precedence over the format: 'concise' gives 50 and
        'detailed' gives 500. Otherwise numbers get 20, list formats get 300
        and everything else gets 200.
        """
        if 'concise' in modifiers:
            return 50
        if 'detailed' in modifiers:
            return 500
        if expected_format == AnswerFormat.NUMBER:
            return 20
        # All three list formats share the same budget.
        if expected_format in (
            AnswerFormat.LIST_ALPHABETICAL,
            AnswerFormat.LIST_CHRONOLOGICAL,
            AnswerFormat.LIST_NUMERICAL,
        ):
            return 300
        return 200

    def _calculate_confidence(self, intent: "QuestionIntent",
                            expected_format: "AnswerFormat",
                            modifiers: List[str]) -> float:
        """Calculate confidence score for the analysis.

        Starts at 0.7 and adds 0.1 for each of: a recognised intent, a format
        other than the ``TEXT_CONCISE`` fallback, and at least one modifier.
        The result is rounded to two decimals and capped at 1.0.
        """
        confidence = 0.7

        if intent != QuestionIntent.UNKNOWN:
            confidence += 0.1

        if expected_format != AnswerFormat.TEXT_CONCISE:
            confidence += 0.1

        if modifiers:
            confidence += 0.1

        # Rounding removes binary float drift (0.7 + 0.1 == 0.7999999999999999).
        return min(1.0, round(confidence, 2))


def analyze_question_intelligently(question: str) -> QuestionAnalysis:
    """
    Analyze a single question with a freshly constructed analyzer.

    Shortcut for callers that do not want to manage an
    ``IntelligentQuestionAnalyzer`` instance themselves.

    Args:
        question: The question to analyze

    Returns:
        QuestionAnalysis produced by ``IntelligentQuestionAnalyzer.analyze_question``
    """
    return IntelligentQuestionAnalyzer().analyze_question(question)