"""
Question Classifier Module

This module provides a simplified 3-way classification system for questions:
1. calculation - Mathematical operations, conversions, computations
2. url - Questions that require specific URL/webpage access
3. general_web_search - Questions that need web research using search engines

Extracted from BasicAgent._classify_question() method in app.py for clean
separation of concerns.
"""

import re
from typing import Any, Dict, List, Optional, Tuple


class QuestionClassifier:
    """
    Simplified question classifier that categorizes questions into 3 main types:
    - calculation: Math operations, unit conversions, numerical computations
    - url: Questions requiring specific URL access or known webpage content
    - general_web_search: Questions needing web search for factual information

    Classification combines two mechanisms:
    1. Hard-coded "specific rules" (operators, regexes, question-word prefixes)
       that short-circuit to a category.
    2. Keyword scores (one point per keyword found as a *substring* of the
       lowercased question), with ties broken by ``classification_priority``.
    """

    def __init__(self):
        """Initialize the classifier with pattern definitions."""
        self._init_classification_patterns()
        self._init_priority_rules()

    def _init_classification_patterns(self):
        """Initialize keyword patterns for each classification category."""
        # NOTE: keywords are matched as plain substrings, so short entries such
        # as 'to', 'who' or 'now' also hit inside longer words ("history",
        # "whole", "known"). Scores are therefore a coarse signal only.

        # Calculation patterns - mathematical operations and conversions
        self.calculation_patterns = {
            'arithmetic': [
                'calculate', 'compute', 'what is', '+', '-', '*', '/',
                'plus', 'minus', 'times', 'multiply', 'divide', 'sum',
                'product', 'add', 'subtract', 'difference'
            ],
            'percentage': [
                'percent', '%', 'percentage', 'rate', 'ratio'
            ],
            'conversion': [
                'convert', 'meters', 'feet', 'inches', 'celsius', 'fahrenheit',
                'miles', 'kilometers', 'pounds', 'kilograms', 'temperature',
                'length', 'weight', 'distance', 'from', 'to'
            ],
            'financial': [
                'compound', 'interest', 'investment', 'principal', 'rate',
                'growth', 'productivity', 'quarter', 'quarters'
            ]
        }

        # URL patterns - questions requiring specific webpage access
        self.url_patterns = {
            'specific_sites': [
                'wikipedia', 'universe today', 'nasa', 'featured article',
                'discography', 'promoted', 'nominated', 'publication',
                'article published', 'website', 'blog post'
            ],
            'specific_content': [
                'mercedes sosa', 'albums', 'dinosaur article', 'november 2016',
                'june 6 2023', 'carolyn collins petersen', 'award number',
                'between 2000 and 2009', '2000-2009', 'release', 'released'
            ],
            'artist_discography': [
                'mercedes sosa albums', 'discography', 'studio albums',
                'albums released', 'albums between'
            ]
        }

        # General web search patterns - factual questions needing search
        self.general_web_search_patterns = {
            'geography': [
                'capital', 'country', 'city', 'continent', 'ocean', 'mountain',
                'river', 'largest', 'biggest', 'smallest', 'population',
                'area', 'border', 'location'
            ],
            'history': [
                'when', 'born', 'birth', 'died', 'death', 'war', 'battle',
                'founded', 'established', 'year', 'date', 'historical',
                'ancient', 'century'
            ],
            'science': [
                'formula', 'element', 'compound', 'speed', 'light', 'physics',
                'chemistry', 'biology', 'boiling', 'freezing', 'point',
                'water', 'scientific', 'discovery', 'theory'
            ],
            'counting': [
                'how many', 'number of', 'count', 'total', 'continents',
                'planets', 'states', 'oceans', 'countries', 'people'
            ],
            'current_events': [
                'today', 'current', 'latest', 'recent', 'now', '2024', '2025',
                'news', 'happening'
            ],
            'general_facts': [
                'who', 'what', 'where', 'why', 'how', 'definition', 'meaning',
                'explain', 'describe'
            ]
        }

    def _init_priority_rules(self):
        """Initialize priority rules for classification conflicts."""
        # Priority order for 3-way classification (most specific to least specific)
        self.classification_priority = [
            'calculation',
            'url',
            'general_web_search'
        ]

        # The sub-category orderings below are stored on the instance but are
        # not consulted by any method in this module.

        # Sub-category priority within calculation
        self.calculation_subcategory_priority = [
            'conversion',
            'financial',
            'percentage',
            'arithmetic'
        ]

        # Sub-category priority within URL
        self.url_subcategory_priority = [
            'artist_discography',
            'specific_content',
            'specific_sites'
        ]

        # Sub-category priority within general web search
        self.general_web_search_subcategory_priority = [
            'counting',
            'geography',
            'history',
            'science',
            'current_events',
            'general_facts'
        ]

    def _pattern_groups(self) -> Dict[str, Dict[str, List[str]]]:
        """Return the keyword tables keyed by top-level category name."""
        return {
            'calculation': self.calculation_patterns,
            'url': self.url_patterns,
            'general_web_search': self.general_web_search_patterns,
        }

    def classify_question(self, question: str) -> str:
        """
        Classify a question into one of three categories.

        Args:
            question (str): The question to classify

        Returns:
            str: One of 'calculation', 'url', or 'general_web_search'.
                Empty or non-string input yields 'general_web_search'.
        """
        if not question or not isinstance(question, str):
            return 'general_web_search'

        # Clean and prepare the question
        q_lower = question.lower().strip()

        # Get classification scores for each category
        scores = self._calculate_classification_scores(q_lower)

        # Apply classification logic with priority rules
        return self._apply_classification_rules(scores, q_lower)

    def classify_with_confidence(self, question: str) -> Tuple[str, float, Dict[str, int]]:
        """
        Classify a question and return classification with confidence score and details.

        Args:
            question (str): The question to classify

        Returns:
            Tuple[str, float, Dict[str, int]]:
                (classification, confidence, detailed_scores). Empty or
                non-string input yields ('general_web_search', 0.0, {}).
        """
        if not question or not isinstance(question, str):
            return 'general_web_search', 0.0, {}

        q_lower = question.lower().strip()
        scores = self._calculate_classification_scores(q_lower)
        classification = self._apply_classification_rules(scores, q_lower)

        # Calculate confidence based on score distribution
        confidence = self._calculate_confidence(scores, classification)

        return classification, confidence, scores

    def _calculate_classification_scores(self, question: str) -> Dict[str, int]:
        """
        Calculate keyword match scores for each classification category.

        Each keyword that occurs as a substring of ``question`` adds one point
        to its category. A keyword listed in several sub-categories (e.g.
        'rate', 'discography') is counted once per listing.
        """
        return {
            category: sum(
                1
                for keywords in subcategories.values()
                for keyword in keywords
                if keyword in question
            )
            for category, subcategories in self._pattern_groups().items()
        }

    def _apply_classification_rules(self, scores: Dict[str, int], question: str) -> str:
        """
        Apply classification rules with priority handling.

        Order of evaluation: all-zero scores -> default; specific rules;
        unique highest score; tie broken by ``classification_priority``.
        """
        # If no patterns match, default to general web search
        if all(score == 0 for score in scores.values()):
            return 'general_web_search'

        # Apply specific pattern detection rules
        classification = self._apply_specific_rules(question, scores)
        if classification:
            return classification

        # Handle ties and conflicts using priority rules
        max_score = max(scores.values())
        tied_categories = [cat for cat, score in scores.items() if score == max_score]

        # If only one category has the max score, return it
        if len(tied_categories) == 1:
            return tied_categories[0]

        # Resolve ties using priority order
        for category in self.classification_priority:
            if category in tied_categories:
                return category

        # Defensive fallback: only reached if a tied category is missing from
        # classification_priority.
        return max(scores, key=scores.get)

    def _apply_specific_rules(self, question: str, scores: Dict[str, int]) -> Optional[str]:
        """
        Apply specific detection rules for edge cases.

        Args:
            question (str): Lowercased, stripped question text.
            scores (Dict[str, int]): Category scores (currently unused here;
                kept for interface stability).

        Returns:
            Optional[str]: A category when a rule fires, otherwise None so the
            caller falls back to score-based selection.
        """
        # Strong calculation indicators. '-' and '/' are deliberately NOT
        # treated as decisive on their own: they occur in hyphenated words
        # ("vice-president") and URLs ("https://..."), which previously forced
        # every such question into 'calculation'.
        if any(op in question for op in ('+', '*', '%')):
            return 'calculation'

        # Mathematical expressions or numbers with operations
        if re.search(r'\d+\s*[+\-*/]\s*\d+', question):
            return 'calculation'

        # '-' or '/' standing alone between whitespace reads as an operator
        # (e.g. "x / 2"), unlike one embedded inside a word or URL.
        if re.search(r'\s[-/]\s', question):
            return 'calculation'

        # Conversion phrases
        if re.search(r'\d+.*(?:to|in|convert).*(?:feet|meters|celsius|fahrenheit)', question):
            return 'calculation'

        # Specific URL-type questions
        url_indicators = [
            'wikipedia.*article.*promoted',
            'universe today.*published',
            'nasa.*award.*number',
            'discography.*albums.*between',
            'mercedes sosa.*albums.*between',
            'albums.*release.*between',
            'dinosaur.*article.*wikipedia',
            'nominated.*wikipedia.*featured'
        ]
        if any(re.search(pattern, question) for pattern in url_indicators):
            return 'url'

        # Additional artist discography checks
        if ('mercedes sosa' in question and 'albums' in question) or \
           ('discography' in question and any(year in question for year in ['2000', '2009'])):
            return 'url'

        # Strong web search indicators
        if question.startswith(('who ', 'what ', 'where ', 'when ', 'how many ')):
            # But not if it's clearly mathematical. Operator characters that
            # actually act as operators have already returned above, so only
            # the explicit verbs need checking here.
            if not any(word in question for word in ['calculate', 'compute']):
                return 'general_web_search'

        return None

    def _calculate_confidence(self, scores: Dict[str, int], classification: str) -> float:
        """
        Calculate confidence score for the classification.

        Confidence is the chosen category's share of the total score, boosted
        by 20% (capped at 1.0) when it exceeds 1.5x the best competing score,
        then rounded to two decimals. Returns 0.0 when all scores are zero.
        """
        total_score = sum(scores.values())
        if total_score == 0:
            return 0.0

        classified_score = scores[classification]
        confidence = classified_score / total_score

        # Adjust confidence based on score distribution
        other_scores = [score for cat, score in scores.items() if cat != classification]
        max_other_score = max(other_scores) if other_scores else 0

        # If classification score is much higher than others, increase confidence
        if classified_score > max_other_score * 1.5:
            confidence = min(1.0, confidence * 1.2)

        return round(confidence, 2)

    def get_detailed_analysis(self, question: str) -> Dict[str, Any]:
        """
        Get detailed analysis of question classification including subcategory matches.

        Args:
            question (str): The question to analyze

        Returns:
            Dict: Detailed analysis including subcategory matches and
            reasoning, or ``{'error': 'Invalid question input'}`` for empty or
            non-string input.
        """
        if not question or not isinstance(question, str):
            return {'error': 'Invalid question input'}

        q_lower = question.lower().strip()
        classification, confidence, scores = self.classify_with_confidence(question)

        # Get subcategory matches
        subcategory_matches = self._get_subcategory_matches(q_lower)

        # Identify specific patterns that influenced classification
        influencing_patterns = self._get_influencing_patterns(q_lower, classification)

        return {
            'question': question,
            'classification': classification,
            'confidence': confidence,
            'category_scores': scores,
            'subcategory_matches': subcategory_matches,
            'influencing_patterns': influencing_patterns,
            'reasoning': self._generate_reasoning(classification, scores, subcategory_matches)
        }

    def _get_subcategory_matches(self, question: str) -> Dict[str, Dict[str, List[str]]]:
        """
        Get matches for each subcategory.

        Returns:
            Mapping of category -> {subcategory -> matched keywords}. All three
            categories are always present; subcategories with no match are
            omitted.
        """
        matches: Dict[str, Dict[str, List[str]]] = {}
        for category, subcategories in self._pattern_groups().items():
            matches[category] = {}
            for subcategory, keywords in subcategories.items():
                matched = [kw for kw in keywords if kw in question]
                if matched:
                    matches[category][subcategory] = matched
        return matches

    def _get_influencing_patterns(self, question: str, classification: str) -> List[str]:
        """
        Get the specific patterns that influenced the classification.

        Args:
            question (str): Lowercased, stripped question text.
            classification (str): The chosen category (currently unused here;
                kept for interface stability).

        Returns:
            List[str]: Labels of the surface patterns found in the question.
        """
        patterns = []

        # Mathematical operators
        if re.search(r'[+\-*/]', question):
            patterns.append('mathematical_operators')

        # Numbers with operations
        if re.search(r'\d+\s*[+\-*/]\s*\d+', question):
            patterns.append('numeric_expression')

        # Conversion patterns: the verb "convert" on its own, or a standalone
        # "to"/"in" followed by a known unit. The alternation is grouped so a
        # bare "to" (as in "history", "today") no longer counts.
        if re.search(r'convert|\b(?:to|in)\b.*(?:feet|meters|celsius|fahrenheit)', question):
            patterns.append('unit_conversion')

        # Question words
        question_words = ['who', 'what', 'where', 'when', 'how', 'why']
        for word in question_words:
            if question.startswith(word + ' '):
                patterns.append(f'question_word_{word}')

        # Specific site mentions
        if 'wikipedia' in question:
            patterns.append('wikipedia_mention')
        if 'universe today' in question:
            patterns.append('universe_today_mention')

        return patterns

    def _generate_reasoning(self, classification: str, scores: Dict[str, int],
                            subcategory_matches: Dict[str, Dict[str, List[str]]]) -> str:
        """
        Generate human-readable reasoning for the classification.

        Returns:
            str: Sentences joined by ". ": the headline for the category, the
            matched sub-categories (if any) and, when any score is non-zero,
            "Primary score: <chosen>/<max>".
        """
        reasoning_parts = []

        # Main classification reasoning
        if classification == 'calculation':
            reasoning_parts.append("Classified as calculation due to mathematical content")
            if subcategory_matches['calculation']:
                subcats = list(subcategory_matches['calculation'].keys())
                reasoning_parts.append(f"Detected {', '.join(subcats)} patterns")

        elif classification == 'url':
            reasoning_parts.append("Classified as URL access due to specific site/content references")
            if subcategory_matches['url']:
                subcats = list(subcategory_matches['url'].keys())
                reasoning_parts.append(f"Detected {', '.join(subcats)} patterns")

        else:  # general_web_search
            reasoning_parts.append("Classified as general web search for factual information")
            if subcategory_matches['general_web_search']:
                subcats = list(subcategory_matches['general_web_search'].keys())
                reasoning_parts.append(f"Detected {', '.join(subcats)} patterns")

        # Score information
        max_score = max(scores.values())
        if max_score > 0:
            reasoning_parts.append(f"Primary score: {scores[classification]}/{max_score}")

        return ". ".join(reasoning_parts)


# Convenience functions for backward compatibility
def classify_question(question: str) -> str:
    """
    Convenience function to classify a single question.

    Args:
        question (str): The question to classify

    Returns:
        str: One of 'calculation', 'url', or 'general_web_search'
    """
    classifier = QuestionClassifier()
    return classifier.classify_question(question)


def get_question_analysis(question: str) -> Dict[str, Any]:
    """
    Convenience function to get detailed analysis of a question.

    Args:
        question (str): The question to analyze

    Returns:
        Dict: Detailed analysis including classification and reasoning
    """
    classifier = QuestionClassifier()
    return classifier.get_detailed_analysis(question)


# Example usage and testing
if __name__ == "__main__":
    # Example usage
    classifier = QuestionClassifier()

    test_questions = [
        "What is 25 + 37?",
        "Convert 100 fahrenheit to celsius",
        "How many continents are there?",
        "Who is the president of France?",
        "What albums did Mercedes Sosa release between 2000 and 2009?",
        "Calculate 15% of 200",
        "What is the capital of Japan?"
    ]

    print("Question Classification Examples:")
    print("=" * 50)

    for question in test_questions:
        classification, confidence, scores = classifier.classify_with_confidence(question)
        print(f"Q: {question}")
        print(f"Classification: {classification} (confidence: {confidence})")
        print(f"Scores: {scores}")
        print("-" * 30)