# azettl's picture
# remove google scholar
# 6d0f82e
"""
Base class for all research tools
"""
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional
import time
import re
from datetime import datetime
class BaseTool(ABC):
    """Abstract base class for all research tools.

    Subclasses must implement :meth:`search`. The base class supplies
    simple request throttling, heuristic quality scoring of result text,
    and a consistent error-message format.
    """

    # Baseline authority score per known source name (lower-case keys).
    _AUTHORITY_BY_SOURCE = {
        'arxiv': 0.9,
        'sec': 0.95,
        'github': 0.7,
        'wikipedia': 0.8,
        'web': 0.5
    }
    # Score used when the source name is missing or not in the table.
    _DEFAULT_AUTHORITY = 0.5
    # Substrings whose presence in the text nudges the authority score up.
    _CREDIBILITY_MARKERS = (
        'study', 'research', 'university', 'published', 'peer-reviewed', 'official'
    )
    # Relative weight of each metric in the "overall" score (sums to 1.0).
    _QUALITY_WEIGHTS = {"recency": 0.2, "authority": 0.3, "specificity": 0.3, "relevance": 0.2}

    def __init__(self, name: str, description: str):
        """Store the tool's identity and initialise rate-limit state.

        Args:
            name: Display name; also used in error responses.
            description: Human-readable description of the tool.
        """
        self.name = name
        self.description = description
        self.last_request_time = 0
        self.rate_limit_delay = 1.0  # seconds between requests

    @abstractmethod
    def search(self, query: str, **kwargs) -> str:
        """Main search method - must be implemented by subclasses."""
        pass

    def rate_limit(self):
        """Sleep as needed so calls are at least ``rate_limit_delay`` seconds apart.

        Updates ``last_request_time`` to the time at which the call returns.
        """
        elapsed = time.time() - self.last_request_time
        if elapsed < self.rate_limit_delay:
            time.sleep(self.rate_limit_delay - elapsed)
        self.last_request_time = time.time()

    def score_research_quality(self, research_result: str, source: str = "web") -> Dict[str, float]:
        """Score research text on several quality indicators.

        Args:
            research_result: The text to evaluate.
            source: Name of the source the text came from (e.g. ``"arxiv"``).

        Returns:
            A dict with ``recency``, ``authority``, ``specificity`` and
            ``relevance`` scores plus their weighted ``overall`` combination.
        """
        quality_score = {
            "recency": self._check_recency(research_result),
            "authority": self._check_authority(research_result, source),
            "specificity": self._check_specificity(research_result),
            "relevance": self._check_relevance(research_result),
            "overall": 0.0
        }
        # Weighted overall score
        quality_score["overall"] = sum(
            quality_score[metric] * weight
            for metric, weight in self._QUALITY_WEIGHTS.items()
        )
        return quality_score

    def _check_recency(self, text: str) -> float:
        """Score how recent the latest 20xx year mentioned in ``text`` is.

        Returns 0.3 when the text is empty or contains no such year.
        Otherwise the score decays linearly over 10 years from the current
        year and is clamped to the range [0.0, 1.0].
        """
        if not text:
            return 0.3
        years = re.findall(r'\b(20\d{2})\b', text)
        if not years:
            return 0.3  # Default for no date found
        age = datetime.now().year - max(int(year) for year in years)
        # Clamp on both sides: a future year (e.g. a forecast for 2050) makes
        # `age` negative and would otherwise push the score above 1.0.
        return min(1.0, max(0.0, 1 - age / 10))

    def _check_authority(self, text: str, source: str) -> float:
        """Score source authority plus credibility markers found in ``text``.

        The baseline comes from the source name (case-insensitive); unknown,
        empty or ``None`` sources get the generic baseline. Each distinct
        credibility marker present in the text adds 0.05, up to a 0.3 boost.
        The result is capped at 1.0.
        """
        # A missing source is treated like the generic "web" source instead
        # of raising AttributeError on None.lower().
        source_key = (source or "web").lower()
        score = self._AUTHORITY_BY_SOURCE.get(source_key, self._DEFAULT_AUTHORITY)
        if text:
            lowered = text.lower()  # lower-case once, not once per marker
            marker_count = sum(1 for marker in self._CREDIBILITY_MARKERS if marker in lowered)
            score += min(0.3, marker_count * 0.05)
        return min(1.0, score)

    def _check_specificity(self, text: str) -> float:
        """Score the density of numbers and precision words in ``text``.

        Each number contributes 0.02 and each precision term (exactly,
        precisely, specifically, measured, calculated) contributes 0.1.
        The result is kept within [0.1, 1.0]; empty text scores 0.1.
        """
        if not text:
            return 0.1
        # Count numbers, percentages, specific metrics
        numbers = len(re.findall(r'\b\d+(?:\.\d+)?%?\b', text))
        specific_terms = len(
            re.findall(r'\b(?:exactly|precisely|specifically|measured|calculated)\b', text, re.IGNORECASE)
        )
        specificity = min(1.0, (numbers * 0.02) + (specific_terms * 0.1))
        return max(0.1, specificity)  # Minimum baseline

    def _check_relevance(self, text: str) -> float:
        """Return a fixed placeholder relevance score of 0.7.

        The text is not inspected; a real implementation would compare it
        against the original query.
        """
        return 0.7  # Placeholder - could be enhanced with query matching

    def should_use_for_query(self, query: str) -> bool:
        """Determine if this tool should be used for the given query.

        The default always returns True; override in subclasses for smart
        routing.
        """
        return True

    def extract_key_info(self, text: str) -> Dict[str, Any]:
        """Extract simple structural facts from research results.

        Returns:
            An empty dict for empty text; otherwise a dict with the text
            ``length`` and booleans ``has_numbers``, ``has_dates`` (a 20xx
            year) and ``has_urls`` (an http/https URL).
        """
        if not text:
            return {}
        return {
            'length': len(text),
            'has_numbers': bool(re.search(r'\d+', text)),
            'has_dates': bool(re.search(r'\b20\d{2}\b', text)),
            'has_urls': bool(re.search(r'http[s]?://', text))
        }

    def format_error_response(self, query: str, error: str) -> str:
        """Format a consistent error response.

        Only the first 100 characters of ``error`` are included, followed by
        an ellipsis.
        """
        return f"**{self.name} Research for: {query}**\n\nResearch temporarily unavailable: {str(error)[:100]}..."