File size: 4,901 Bytes
ce0bf87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
"""
Base class for all research tools
"""
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional
import time
import re
from datetime import datetime


class BaseTool(ABC):
    """Abstract base class for all research tools.

    Subclasses must implement :meth:`search`. The base class supplies
    simple request throttling, heuristic quality scoring of result text,
    lightweight feature extraction, and a uniform error message format.
    """

    def __init__(self, name: str, description: str):
        """Store the tool's identity and initialise rate-limit state.

        Args:
            name: Display name of the tool (used in error responses).
            description: Human-readable description of the tool.
        """
        self.name = name
        self.description = description
        self.last_request_time = 0
        self.rate_limit_delay = 1.0  # seconds between requests

    @abstractmethod
    def search(self, query: str, **kwargs) -> str:
        """Main search method - must be implemented by subclasses"""
        pass

    def rate_limit(self):
        """Sleep as needed so calls are at least ``rate_limit_delay`` apart.

        Records the time of this call in ``last_request_time`` once any
        required wait has finished.
        """
        elapsed = time.time() - self.last_request_time
        remaining = self.rate_limit_delay - elapsed
        if remaining > 0:
            # Never wait longer than one full delay: if the wall clock was
            # set backwards (or last_request_time is otherwise in the
            # future), ``elapsed`` is negative and the raw difference would
            # stall the caller for an arbitrarily long time.
            time.sleep(max(0.0, min(remaining, self.rate_limit_delay)))
        self.last_request_time = time.time()

    def score_research_quality(self, research_result: str, source: str = "web") -> Dict[str, float]:
        """Score research text on several heuristic quality indicators.

        Args:
            research_result: The text returned by a research tool.
            source: Identifier of the originating source (e.g. ``"arxiv"``);
                used to look up a base authority score.

        Returns:
            A dict with keys ``"recency"``, ``"authority"``,
            ``"specificity"``, ``"relevance"`` and ``"overall"``, where
            ``"overall"`` is the weighted sum of the other four.
        """
        quality_score = {
            "recency": self._check_recency(research_result),
            "authority": self._check_authority(research_result, source),
            "specificity": self._check_specificity(research_result),
            "relevance": self._check_relevance(research_result),
            "overall": 0.0,
        }

        # Weighted overall score (weights sum to 1.0)
        weights = {"recency": 0.2, "authority": 0.3, "specificity": 0.3, "relevance": 0.2}
        quality_score["overall"] = sum(quality_score[metric] * weight for metric, weight in weights.items())

        return quality_score

    def _check_recency(self, text: str) -> float:
        """Score how recent the text appears, based on the latest 20xx year.

        Returns 0.3 when the text is empty or mentions no 20xx year.
        Otherwise the score decays linearly from 1.0 (current year) to 0.0
        (ten or more years old).
        """
        if not text:
            return 0.3

        # Look for years
        years = re.findall(r'\b(20\d{2})\b', text)
        if not years:
            return 0.3  # Default for no date found

        latest_year = max(int(year) for year in years)
        age_in_years = datetime.now().year - latest_year
        # Clamp on both sides: a year in the future (forecasts, or any
        # four-digit number such as "2099") gives a negative age, which
        # would otherwise yield a score above 1.0 and inflate "overall".
        return min(1.0, max(0.0, 1 - age_in_years / 10))  # Decay over 10 years

    def _check_authority(self, text: str, source: str) -> float:
        """Score source authority plus credibility markers found in the text.

        Args:
            text: Research text scanned for credibility keywords.
            source: Source identifier, matched case-insensitively against
                the known-source table; unknown, empty or ``None`` sources
                get the default base score of 0.5.

        Returns:
            Base score plus 0.05 per distinct marker present (boost capped
            at 0.3), with the total capped at 1.0.
        """
        authority_indicators = {
            'arxiv': 0.9,
            'sec': 0.95,
            'github': 0.7,
            'wikipedia': 0.8,
            'web': 0.5
        }

        # ``source or ''`` keeps a missing source from raising on .lower()
        base_score = authority_indicators.get((source or '').lower(), 0.5)

        # Look for credibility markers in text
        if text:
            credibility_markers = ['study', 'research', 'university', 'published', 'peer-reviewed', 'official']
            lowered = text.lower()  # lower-case once, not once per marker
            marker_count = sum(1 for marker in credibility_markers if marker in lowered)
            credibility_boost = min(0.3, marker_count * 0.05)
            base_score += credibility_boost

        return min(1.0, base_score)

    def _check_specificity(self, text: str) -> float:
        """Score how much quantitative / precise content the text contains.

        Each number contributes 0.02 and each precision keyword
        (exactly, precisely, specifically, measured, calculated) 0.1.
        The result is bounded to the range [0.1, 1.0]; empty text scores 0.1.
        """
        if not text:
            return 0.1

        # Count numbers, percentages, specific metrics
        numbers = len(re.findall(r'\b\d+(?:\.\d+)?%?\b', text))
        specific_terms = len(re.findall(r'\b(?:exactly|precisely|specifically|measured|calculated)\b', text, re.IGNORECASE))

        specificity = min(1.0, (numbers * 0.02) + (specific_terms * 0.1))
        return max(0.1, specificity)  # Minimum baseline

    def _check_relevance(self, text: str) -> float:
        """Return a fixed baseline relevance score (placeholder).

        The text is not inspected; this would ideally compare against the
        original query.
        """
        return 0.7  # Placeholder - could be enhanced with query matching

    def should_use_for_query(self, query: str) -> bool:
        """Determine if this tool should be used for the given query.

        The default implementation accepts every query; override in
        subclasses for smart routing.
        """
        return True

    def extract_key_info(self, text: str) -> Dict[str, Any]:
        """Extract simple features from research results.

        Returns:
            An empty dict for empty text; otherwise a dict with ``length``
            (character count) and the booleans ``has_numbers``,
            ``has_dates`` (a 20xx year is present) and ``has_urls``
            (an ``http://`` or ``https://`` prefix is present).
        """
        if not text:
            return {}

        return {
            'length': len(text),
            'has_numbers': bool(re.search(r'\d+', text)),
            'has_dates': bool(re.search(r'\b20\d{2}\b', text)),
            'has_urls': bool(re.search(r'http[s]?://', text))
        }

    def format_error_response(self, query: str, error: str) -> str:
        """Format a consistent error response.

        The error is converted to a string and truncated to its first 100
        characters, followed by an ellipsis.
        """
        return f"**{self.name} Research for: {query}**\n\nResearch temporarily unavailable: {str(error)[:100]}..."