"""
arXiv Academic Paper Search Tool
"""
from .base_tool import BaseTool
import requests
import xml.etree.ElementTree as ET
from datetime import datetime
from typing import Dict, List, Optional


class ArxivSearchTool(BaseTool):
    """Search arXiv for academic papers and research"""
    
    def __init__(self):
        super().__init__("arXiv", "Search academic papers and research on arXiv")
        self.base_url = "http://export.arxiv.org/api/query"
        self.rate_limit_delay = 2.0  # Be respectful to arXiv
    
    def search(self, query: str, max_results: int = 5, **kwargs) -> str:
        """Search arXiv for academic papers"""
        self.rate_limit()
        
        try:
            # Prepare search parameters
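            # 'all:' matches every metadata field; sortBy accepts relevance,
            # lastUpdatedDate, or submittedDate per the arXiv API spec.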
            params = {
                'search_query': f'all:{query}',
                'start': 0,
                'max_results': max_results,
                'sortBy': 'relevance',
                'sortOrder': 'descending'
            }
            
            # Make request with better error handling
            response = requests.get(
                self.base_url,
                params=params,
                timeout=20,
                headers={'User-Agent': 'Research Tool (research@academic.edu)'},
            )
            response.raise_for_status()
            
            # Parse XML response
            root = ET.fromstring(response.content)
            
            # Extract paper information
            papers = []
            for entry in root.findall('{http://www.w3.org/2005/Atom}entry'):
                paper = self._parse_arxiv_entry(entry)
                if paper:
                    papers.append(paper)
            
            # Format results
            if papers:
                result = f"**arXiv Academic Research for: {query}**\n\n"
                for i, paper in enumerate(papers, 1):
                    result += f"**Paper {i}: {paper['title']}**\n"
                    result += f"Authors: {paper['authors']}\n"
                    result += f"Published: {paper['published']}\n"
                    result += f"Category: {paper.get('category', 'Unknown')}\n"
                    result += f"Abstract: {paper['abstract'][:400]}...\n"
                    result += f"Link: {paper['link']}\n\n"
                
                # Add research quality assessment
                result += self._assess_arxiv_quality(papers)
                
                return result
            else:
                return f"**arXiv Research for: {query}**\n\nNo relevant academic papers found on arXiv."
                
        except requests.Timeout:
            return f"**arXiv Research for: {query}**\n\nRequest timed out - arXiv may be under heavy load. Try again shortly."
        except requests.ConnectionError as e:
            if "Connection reset" in str(e):
                return f"**arXiv Research for: {query}**\n\nConnection reset by the arXiv server - this usually indicates rate limiting. Try again after a short delay."
            return self.format_error_response(query, f"Connection error: {str(e)}")
        except requests.RequestException as e:
            return self.format_error_response(query, f"Network error accessing arXiv: {str(e)}")
        except ET.ParseError as e:
            return self.format_error_response(query, f"Error parsing arXiv response: {str(e)}")
        except Exception as e:
            return self.format_error_response(query, str(e))
    
    def _parse_arxiv_entry(self, entry) -> Optional[Dict[str, str]]:
        """Parse individual arXiv entry"""
        try:
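            # The arXiv API returns an Atom 1.0 feed, so every element is
            # namespace-qualified; findall/find need the namespace mapping.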
            ns = {'atom': 'http://www.w3.org/2005/Atom'}
            
            title = entry.find('atom:title', ns)
            title_text = title.text.strip().replace('\n', ' ') if title is not None else "Unknown Title"
            
            authors = entry.findall('atom:author/atom:name', ns)
            author_names = [a.text.strip() for a in authors if a.text] or ["Unknown Author"]
            
            published = entry.find('atom:published', ns)
            published_text = published.text[:10] if published is not None else "Unknown Date"  # YYYY-MM-DD
            
            summary = entry.find('atom:summary', ns)
            abstract = summary.text.strip().replace('\n', ' ') if summary is not None else "No abstract available"
            
            link = entry.find('atom:id', ns)
            link_url = link.text if link is not None else ""
            
            # Extract category
            categories = entry.findall('atom:category', ns)
            category = categories[0].get('term') if categories else "Unknown"
            
            return {
                'title': title_text,
                'authors': ', '.join(author_names[:3]),  # Limit to first 3 authors
                'published': published_text,
                'abstract': abstract,
                'link': link_url,
                'category': category
            }
        except Exception as e:
            print(f"Error parsing arXiv entry: {e}")
            return None
    
    def _assess_arxiv_quality(self, papers: List[Dict]) -> str:
        """Assess the quality of arXiv search results"""
        if not papers:
            return ""
        
        # Count papers from the current or previous year, computed dynamically
        # so the recency check does not go stale like a hardcoded year would
        current_year = datetime.now().year
        recent_years = (str(current_year - 1), str(current_year))
        recent_papers = sum(1 for paper in papers if paper['published'].startswith(recent_years))
        
        quality_assessment = f"**Research Quality Assessment:**\n"
        quality_assessment += f"• Papers found: {len(papers)}\n"
        quality_assessment += f"• Recent papers (2024-2025): {recent_papers}/{len(papers)}\n"
        
        # Check for high-impact categories
        categories = [paper.get('category', '') for paper in papers]
        ml_ai_papers = sum(1 for cat in categories if any(term in cat.lower() for term in ['cs.ai', 'cs.lg', 'cs.cv', 'stat.ml']))
        if ml_ai_papers > 0:
            quality_assessment += f"• AI/ML papers: {ml_ai_papers}\n"
        
        quality_assessment += f"• Authority level: High (peer-reviewed preprints)\n\n"
        
        return quality_assessment
    
    def should_use_for_query(self, query: str) -> bool:
        """arXiv is good for scientific, technical, and research-oriented queries"""
        academic_indicators = [
            'research', 'study', 'analysis', 'scientific', 'algorithm', 'method',
            'machine learning', 'ai', 'artificial intelligence', 'deep learning',
            'neural network', 'computer science', 'physics', 'mathematics',
            'quantum', 'cryptography', 'blockchain', 'paper', 'academic'
        ]
        
        query_lower = query.lower()
        return any(indicator in query_lower for indicator in academic_indicators)
    
    def extract_key_info(self, text: str) -> dict:
        """Extract key information from arXiv results"""
        base_info = super().extract_key_info(text)
        
        if text:
            # Look for arXiv-specific patterns
            base_info.update({
                'paper_count': text.count('**Paper'),
                'has_abstracts': 'Abstract:' in text,
                'has_recent_papers': any(year in text for year in ['2024', '2025']),
                'has_ai_ml': any(term in text.lower() for term in ['machine learning', 'ai', 'neural', 'deep learning']),
                'has_arxiv_links': 'arxiv.org' in text
            })
        
        return base_info
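

# Minimal usage sketch (illustrative, not part of the original module): it
# assumes this file lives in a package so the relative `.base_tool` import
# resolves, and that BaseTool provides rate_limit(), format_error_response(),
# and extract_key_info() as the code above implies.
#
#   tool = ArxivSearchTool()
#   if tool.should_use_for_query("quantum error correction research"):
#       print(tool.search("quantum error correction", max_results=3))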