File size: 7,483 Bytes
ce0bf87 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
"""
arXiv Academic Papers Search Tool
"""
from .base_tool import BaseTool
import requests
import xml.etree.ElementTree as ET
from typing import Dict, List, Optional
from urllib.parse import quote
class ArxivSearchTool(BaseTool):
"""Search arXiv for academic papers and research"""
def __init__(self):
super().__init__("arXiv", "Search academic papers and research on arXiv")
self.base_url = "http://export.arxiv.org/api/query"
self.rate_limit_delay = 2.0 # Be respectful to arXiv
def search(self, query: str, max_results: int = 5, **kwargs) -> str:
"""Search arXiv for academic papers"""
self.rate_limit()
try:
# Prepare search parameters
params = {
'search_query': f'all:{query}',
'start': 0,
'max_results': max_results,
'sortBy': 'relevance',
'sortOrder': 'descending'
}
# Make request with better error handling
response = requests.get(self.base_url, params=params, timeout=20,
headers={'User-Agent': 'Research Tool (research@academic.edu)'})
response.raise_for_status()
# Parse XML response
root = ET.fromstring(response.content)
# Extract paper information
papers = []
for entry in root.findall('{http://www.w3.org/2005/Atom}entry'):
paper = self._parse_arxiv_entry(entry)
if paper:
papers.append(paper)
# Format results
if papers:
result = f"**arXiv Academic Research for: {query}**\n\n"
for i, paper in enumerate(papers, 1):
result += f"**Paper {i}: {paper['title']}**\n"
result += f"Authors: {paper['authors']}\n"
result += f"Published: {paper['published']}\n"
result += f"Category: {paper.get('category', 'Unknown')}\n"
result += f"Abstract: {paper['abstract'][:400]}...\n"
result += f"Link: {paper['link']}\n\n"
# Add research quality assessment
result += self._assess_arxiv_quality(papers)
return result
else:
return f"**arXiv Research for: {query}**\n\nNo relevant academic papers found on arXiv."
except requests.Timeout:
return f"**arXiv Research for: {query}**\n\nRequest timeout - arXiv may be experiencing high load. Research available but slower than expected."
except requests.ConnectionError as e:
if "Connection reset" in str(e):
return f"**arXiv Research for: {query}**\n\nConnection reset by arXiv server - this is common due to rate limiting. Academic research is available but temporarily throttled."
return self.format_error_response(query, f"Connection error: {str(e)}")
except requests.RequestException as e:
return self.format_error_response(query, f"Network error accessing arXiv: {str(e)}")
except ET.ParseError as e:
return self.format_error_response(query, f"Error parsing arXiv response: {str(e)}")
except Exception as e:
return self.format_error_response(query, str(e))
def _parse_arxiv_entry(self, entry) -> Optional[Dict[str, str]]:
"""Parse individual arXiv entry"""
try:
ns = {'atom': 'http://www.w3.org/2005/Atom'}
title = entry.find('atom:title', ns)
title_text = title.text.strip().replace('\n', ' ') if title is not None else "Unknown Title"
authors = entry.findall('atom:author/atom:name', ns)
author_names = [author.text for author in authors] if authors else ["Unknown Author"]
published = entry.find('atom:published', ns)
published_text = published.text[:10] if published is not None else "Unknown Date" # YYYY-MM-DD
summary = entry.find('atom:summary', ns)
abstract = summary.text.strip().replace('\n', ' ') if summary is not None else "No abstract available"
link = entry.find('atom:id', ns)
link_url = link.text if link is not None else ""
# Extract category
categories = entry.findall('atom:category', ns)
category = categories[0].get('term') if categories else "Unknown"
return {
'title': title_text,
'authors': ', '.join(author_names[:3]), # Limit to first 3 authors
'published': published_text,
'abstract': abstract,
'link': link_url,
'category': category
}
except Exception as e:
print(f"Error parsing arXiv entry: {e}")
return None
def _assess_arxiv_quality(self, papers: List[Dict]) -> str:
"""Assess the quality of arXiv search results"""
if not papers:
return ""
# Calculate average recency
current_year = 2025
recent_papers = sum(1 for paper in papers if paper['published'].startswith(('2024', '2025')))
quality_assessment = f"**Research Quality Assessment:**\n"
quality_assessment += f"• Papers found: {len(papers)}\n"
quality_assessment += f"• Recent papers (2024-2025): {recent_papers}/{len(papers)}\n"
# Check for high-impact categories
categories = [paper.get('category', '') for paper in papers]
ml_ai_papers = sum(1 for cat in categories if any(term in cat.lower() for term in ['cs.ai', 'cs.lg', 'cs.cv', 'stat.ml']))
if ml_ai_papers > 0:
quality_assessment += f"• AI/ML papers: {ml_ai_papers}\n"
quality_assessment += f"• Authority level: High (peer-reviewed preprints)\n\n"
return quality_assessment
def should_use_for_query(self, query: str) -> bool:
"""arXiv is good for scientific, technical, and research-oriented queries"""
academic_indicators = [
'research', 'study', 'analysis', 'scientific', 'algorithm', 'method',
'machine learning', 'ai', 'artificial intelligence', 'deep learning',
'neural network', 'computer science', 'physics', 'mathematics',
'quantum', 'cryptography', 'blockchain', 'paper', 'academic'
]
query_lower = query.lower()
return any(indicator in query_lower for indicator in academic_indicators)
def extract_key_info(self, text: str) -> dict:
"""Extract key information from arXiv results"""
base_info = super().extract_key_info(text)
if text:
# Look for arXiv-specific patterns
base_info.update({
'paper_count': text.count('**Paper'),
'has_abstracts': 'Abstract:' in text,
'has_recent_papers': any(year in text for year in ['2024', '2025']),
'has_ai_ml': any(term in text.lower() for term in ['machine learning', 'ai', 'neural', 'deep learning']),
'has_arxiv_links': 'arxiv.org' in text
})
return base_info |