Spaces:
Running
Running
GitsSaikat
commited on
Commit
·
3290198
0
Parent(s):
first commit
Browse files- DeepResearch_App/README.md +61 -0
- DeepResearch_App/app.py +205 -0
- DeepResearch_App/logo.png +0 -0
- DeepResearch_App/requirements.txt +4 -0
- DeepResearch_App/research/__pycache__/async_research.cpython-312.pyc +0 -0
- DeepResearch_App/research/__pycache__/deep_research.cpython-312.pyc +0 -0
- DeepResearch_App/research/deep_research.py +355 -0
DeepResearch_App/README.md
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 🔍 Open DeepResearch
|
2 |
+
|
3 |
+

|
4 |
+
|
5 |
+
## Overview
|
6 |
+
|
7 |
+
Open DeepResearch is a powerful research assistant that leverages AI to conduct comprehensive research on any topic. It automates the process of gathering, analyzing, and synthesizing information from multiple sources to generate detailed reports with proper citations.
|
8 |
+
|
9 |
+
## Features
|
10 |
+
|
11 |
+
- 🤖 AI-powered search query generation
|
12 |
+
- 🌐 Automated web searching and content extraction
|
13 |
+
- 📊 Smart relevance filtering
|
14 |
+
- 📝 Comprehensive report generation with citations
|
15 |
+
- 🔄 Iterative research refinement
|
16 |
+
- 📱 User-friendly Streamlit interface
|
17 |
+
|
18 |
+
## Requirements
|
19 |
+
|
20 |
+
You'll need API keys from:
|
21 |
+
- [OpenRouter](https://openrouter.ai/keys)
|
22 |
+
- [SerpAPI](https://serpapi.com/manage-api-key)
|
23 |
+
- [Jina](https://jina.ai/api-key)
|
24 |
+
|
25 |
+
## Installation
|
26 |
+
|
27 |
+
```bash
|
28 |
+
git clone https://github.com/yourusername/DeepResearch_App.git
|
29 |
+
cd DeepResearch_App
|
30 |
+
pip install -r requirements.txt
|
31 |
+
```
|
32 |
+
|
33 |
+
## Usage
|
34 |
+
|
35 |
+
1. Launch the app:
|
36 |
+
```bash
|
37 |
+
streamlit run app.py
|
38 |
+
```
|
39 |
+
|
40 |
+
2. Configure your API keys in the sidebar
|
41 |
+
3. Enter your research query
|
42 |
+
4. Set the number of research iterations
|
43 |
+
5. Click "Start Research" and wait for your detailed report
|
44 |
+
|
45 |
+
## How It Works
|
46 |
+
|
47 |
+
1. **Query Generation**: AI creates targeted search queries based on your topic
|
48 |
+
2. **Web Search**: Automated search across multiple sources
|
49 |
+
3. **Content Analysis**: Relevant information extraction and filtering
|
50 |
+
4. **Report Generation**: Synthesized findings with proper citations
|
51 |
+
5. **Iterative Refinement**: Additional searches based on gaps in information
|
52 |
+
|
53 |
+
## Contributing
|
54 |
+
|
55 |
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
56 |
+
|
57 |
+
## License
|
58 |
+
|
59 |
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
60 |
+
|
61 |
+
|
DeepResearch_App/app.py
ADDED
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import asyncio
|
3 |
+
from research import deep_research
|
4 |
+
from PIL import Image
|
5 |
+
|
6 |
+
# Page configuration
|
7 |
+
st.set_page_config(
|
8 |
+
page_title="Open DeepResearch",
|
9 |
+
page_icon="🔍",
|
10 |
+
layout="wide",
|
11 |
+
initial_sidebar_state="expanded"
|
12 |
+
)
|
13 |
+
|
14 |
+
# Load and display logo in sidebar
|
15 |
+
logo = Image.open('logo.png')
|
16 |
+
st.sidebar.image(logo, width=200, use_container_width=True)
|
17 |
+
|
18 |
+
# Initialize session state for API keys
|
19 |
+
if 'api_keys_configured' not in st.session_state:
|
20 |
+
st.session_state.api_keys_configured = False
|
21 |
+
|
22 |
+
# Custom CSS (previous CSS remains the same)
|
23 |
+
st.markdown("""
|
24 |
+
<style>
|
25 |
+
/* ... previous CSS ... */
|
26 |
+
.api-container {
|
27 |
+
background-color: #f8f9fa;
|
28 |
+
padding: 1.5rem;
|
29 |
+
border-radius: 10px;
|
30 |
+
margin-bottom: 2rem;
|
31 |
+
border: 1px solid #e0e0e0;
|
32 |
+
}
|
33 |
+
.api-header {
|
34 |
+
color: #1E88E5;
|
35 |
+
font-size: 1.2rem;
|
36 |
+
margin-bottom: 1rem;
|
37 |
+
}
|
38 |
+
</style>
|
39 |
+
""", unsafe_allow_html=True)
|
40 |
+
|
41 |
+
# Sidebar for API Configuration
|
42 |
+
with st.sidebar:
|
43 |
+
st.markdown("### ⚙️ API Configuration")
|
44 |
+
st.info("Please configure your API keys before starting research.")
|
45 |
+
|
46 |
+
with st.expander("Configure API Keys", expanded=not st.session_state.api_keys_configured):
|
47 |
+
api_form = st.form("api_keys_form")
|
48 |
+
with api_form:
|
49 |
+
openrouter_key = api_form.text_input(
|
50 |
+
"OpenRouter API Key",
|
51 |
+
type="password",
|
52 |
+
value=st.session_state.get('openrouter_key', ''),
|
53 |
+
help="Required for language model access"
|
54 |
+
)
|
55 |
+
|
56 |
+
serpapi_key = api_form.text_input(
|
57 |
+
"SerpAPI Key",
|
58 |
+
type="password",
|
59 |
+
value=st.session_state.get('serpapi_key', ''),
|
60 |
+
help="Required for web search functionality"
|
61 |
+
)
|
62 |
+
|
63 |
+
jina_key = api_form.text_input(
|
64 |
+
"Jina API Key",
|
65 |
+
type="password",
|
66 |
+
value=st.session_state.get('jina_key', ''),
|
67 |
+
help="Required for content extraction"
|
68 |
+
)
|
69 |
+
|
70 |
+
if api_form.form_submit_button("Save API Keys"):
|
71 |
+
if not all([openrouter_key, serpapi_key, jina_key]):
|
72 |
+
st.error("❌ All API keys are required!")
|
73 |
+
else:
|
74 |
+
# Store API keys in session state
|
75 |
+
st.session_state.openrouter_key = openrouter_key
|
76 |
+
st.session_state.serpapi_key = serpapi_key
|
77 |
+
st.session_state.jina_key = jina_key
|
78 |
+
st.session_state.api_keys_configured = True
|
79 |
+
st.success("✅ API keys saved successfully!")
|
80 |
+
st.rerun()
|
81 |
+
|
82 |
+
if st.session_state.api_keys_configured:
|
83 |
+
st.success("✅ API Keys configured")
|
84 |
+
|
85 |
+
# Add links to get API keys
|
86 |
+
st.markdown("### 🔑 Get API Keys")
|
87 |
+
st.markdown("""
|
88 |
+
- [OpenRouter API Key](https://openrouter.ai/keys)
|
89 |
+
- [SerpAPI Key](https://serpapi.com/manage-api-key)
|
90 |
+
- [Jina API Key](https://jina.ai/api-key)
|
91 |
+
""")
|
92 |
+
|
93 |
+
def run_research(user_query, iteration_limit):
    """
    Inject the configured API keys into the research module and run it.

    The research module reads its credentials from module-level globals,
    so they are copied out of Streamlit session state immediately before
    each run. Returns the final report text.
    """
    deep_research.OPENROUTER_API_KEY = st.session_state.openrouter_key
    deep_research.SERPAPI_API_KEY = st.session_state.serpapi_key
    deep_research.JINA_API_KEY = st.session_state.jina_key
    result = asyncio.run(deep_research.research_flow(user_query, iteration_limit))
    return result
|
99 |
+
|
100 |
+
# Main content
|
101 |
+
if not st.session_state.api_keys_configured:
|
102 |
+
st.warning("⚠️ Please configure your API keys in the sidebar before proceeding.")
|
103 |
+
else:
|
104 |
+
# Title and description
|
105 |
+
st.title("🔍 Open DeepResearch")
|
106 |
+
st.markdown("""
|
107 |
+
<div style='background-color: #e3f2fd; padding: 1rem; border-radius: 10px; margin-bottom: 2rem;'>
|
108 |
+
<h4 style='color: #1565C0; margin-bottom: 0.5rem;'>Welcome to the Open DeepResearch!</h4>
|
109 |
+
<p style='color: #424242;'>
|
110 |
+
This application helps you conduct comprehensive research on any topic by:
|
111 |
+
<br>
|
112 |
+
• Generating relevant search queries<br>
|
113 |
+
• Analyzing multiple sources<br>
|
114 |
+
• Synthesizing information into a detailed report
|
115 |
+
</p>
|
116 |
+
</div>
|
117 |
+
""", unsafe_allow_html=True)
|
118 |
+
|
119 |
+
# Main form in a container
|
120 |
+
with st.container():
|
121 |
+
col1, col2 = st.columns([2, 1])
|
122 |
+
|
123 |
+
with col1:
|
124 |
+
with st.form("research_form", clear_on_submit=False):
|
125 |
+
st.markdown("### Research Parameters")
|
126 |
+
|
127 |
+
user_query = st.text_area(
|
128 |
+
"Research Query",
|
129 |
+
placeholder="Enter your research topic or question here...",
|
130 |
+
help="Be as specific as possible for better results",
|
131 |
+
height=100
|
132 |
+
)
|
133 |
+
|
134 |
+
col_a, col_b = st.columns(2)
|
135 |
+
with col_a:
|
136 |
+
iter_limit_input = st.number_input(
|
137 |
+
"Maximum Iterations",
|
138 |
+
min_value=1,
|
139 |
+
max_value=20,
|
140 |
+
value=10,
|
141 |
+
help="Higher values mean more thorough research but longer processing time"
|
142 |
+
)
|
143 |
+
|
144 |
+
submitted = st.form_submit_button("🚀 Start Research")
|
145 |
+
|
146 |
+
with col2:
|
147 |
+
st.markdown("### Tips for Better Results")
|
148 |
+
st.info("""
|
149 |
+
• Be specific in your query
|
150 |
+
• Use clear, focused questions
|
151 |
+
• Consider including relevant keywords
|
152 |
+
• Specify time periods if applicable
|
153 |
+
""")
|
154 |
+
|
155 |
+
# Process and display results
|
156 |
+
if submitted:
|
157 |
+
if not user_query.strip():
|
158 |
+
st.error("⚠️ Please enter a research query before proceeding.")
|
159 |
+
else:
|
160 |
+
try:
|
161 |
+
with st.spinner("🔄 Conducting research... This may take a few minutes..."):
|
162 |
+
final_report = run_research(user_query, int(iter_limit_input))
|
163 |
+
|
164 |
+
st.markdown("""
|
165 |
+
<div class='report-container'>
|
166 |
+
<h3 style='color: #1E88E5; margin-bottom: 1rem;'>📊 Research Report</h3>
|
167 |
+
</div>
|
168 |
+
""", unsafe_allow_html=True)
|
169 |
+
|
170 |
+
# Display the report in tabs
|
171 |
+
tab1, tab2 = st.tabs(["📝 Formatted Report", "📄 Raw Text"])
|
172 |
+
|
173 |
+
with tab1:
|
174 |
+
st.markdown(final_report)
|
175 |
+
|
176 |
+
with tab2:
|
177 |
+
st.text_area(
|
178 |
+
label="",
|
179 |
+
value=final_report,
|
180 |
+
height=500,
|
181 |
+
help="You can copy the raw text from here"
|
182 |
+
)
|
183 |
+
|
184 |
+
# Download button for the report
|
185 |
+
st.download_button(
|
186 |
+
label="📥 Download Report",
|
187 |
+
data=final_report,
|
188 |
+
file_name="research_report.txt",
|
189 |
+
mime="text/plain"
|
190 |
+
)
|
191 |
+
|
192 |
+
except Exception as e:
|
193 |
+
st.error(f"❌ An error occurred during research: {str(e)}")
|
194 |
+
st.markdown("""
|
195 |
+
<div style='background-color: #ffebee; padding: 1rem; border-radius: 10px;'>
|
196 |
+
<p style='color: #c62828;'>Please try again with a different query or contact support if the issue persists.</p>
|
197 |
+
</div>
|
198 |
+
""", unsafe_allow_html=True)
|
199 |
+
|
200 |
+
# Footer
|
201 |
+
st.markdown("""
|
202 |
+
<div style='text-align: center; color: #666; padding: 2rem;'>
|
203 |
+
<p>Built by GitsSaikat ❤️</p>
|
204 |
+
</div>
|
205 |
+
""", unsafe_allow_html=True)
|
DeepResearch_App/logo.png
ADDED
![]() |
DeepResearch_App/requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
aiohttp
|
3 |
+
nest_asyncio
|
4 |
+
json5
|
DeepResearch_App/research/__pycache__/async_research.cpython-312.pyc
ADDED
Binary file (19.2 kB). View file
|
|
DeepResearch_App/research/__pycache__/deep_research.cpython-312.pyc
ADDED
Binary file (18.4 kB). View file
|
|
DeepResearch_App/research/deep_research.py
ADDED
@@ -0,0 +1,355 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import asyncio
|
2 |
+
import aiohttp
|
3 |
+
import json
|
4 |
+
import nest_asyncio
|
5 |
+
nest_asyncio.apply()
|
6 |
+
|
7 |
+
# API Endpoints
|
8 |
+
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
|
9 |
+
SERPAPI_URL = "https://serpapi.com/search"
|
10 |
+
JINA_BASE_URL = "https://r.jina.ai/"
|
11 |
+
|
12 |
+
# Modify the default model selection
|
13 |
+
DEFAULT_MODEL = "google/gemini-2.0-flash-lite-preview-02-05:free" # Gemini Flash 2.0 model identifier
|
14 |
+
|
15 |
+
# Helper class to hold extracted content along with its source URL
|
16 |
+
class SourcedContext:
    """Pair a piece of extracted text with the URL it came from.

    Keeping provenance alongside each snippet lets the final report
    assign citation numbers per source URL.
    """

    def __init__(self, text, source_url):
        # The relevant text extracted from the page.
        self.text = text
        # URL of the page the text was extracted from.
        self.source_url = source_url
|
20 |
+
|
21 |
+
async def call_openrouter_async(session, messages, model=DEFAULT_MODEL):
    """
    Send a chat-completion request to the OpenRouter API.

    Returns the assistant's reply text on success, or None on any HTTP
    error, unexpected payload shape, or transport failure (errors are
    printed rather than raised).
    """
    request_headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "HTTP-Referer": "https://github.com/Pygen",
        "X-Title": "Research Assistant",
        "Content-Type": "application/json",
    }
    request_body = {
        "model": model,
        "messages": messages,
        "temperature": 0.7,
        "max_tokens": 4096,
    }

    try:
        async with session.post(OPENROUTER_URL, headers=request_headers, json=request_body) as resp:
            if resp.status != 200:
                text = await resp.text()
                print(f"OpenRouter API error: {resp.status} - {text}")
                return None
            result = await resp.json()
            try:
                return result['choices'][0]['message']['content']
            except (KeyError, IndexError):
                print("Unexpected response structure from OpenRouter:", result)
                return None
    except Exception as e:
        print("Error during OpenRouter call:", e)
        return None
|
56 |
+
|
57 |
+
async def generate_search_queries_async(session, user_query):
    """
    Use the LLM to produce up to four clear search queries based on the user's topic.

    Returns a list of query strings, or [] when the LLM call fails or its
    output cannot be parsed as a Python list.
    """
    import ast  # local import: only needed here to safely parse the LLM's list literal

    prompt = (
        "You are a seasoned research assistant. Based on the user's topic, produce as many as four distinct and precise "
        "search queries that will help collect thorough information on the subject. "
        "Return a Python list of strings only, without any code formatting or backticks. "
        "For example: ['query1', 'query2', 'query3']"
    )
    messages = [
        {"role": "system", "content": "You are a precise and supportive research assistant."},
        {"role": "user", "content": f"User Topic: {user_query}\n\n{prompt}"}
    ]
    response = await call_openrouter_async(session, messages)
    if response:
        try:
            cleaned_response = response.strip()
            # Strip a Markdown code fence if the model added one despite instructions.
            if cleaned_response.startswith("```"):
                cleaned_response = cleaned_response.split("```")[1]
                if cleaned_response.startswith("python"):
                    cleaned_response = cleaned_response[6:]
            cleaned_response = cleaned_response.strip()

            # SECURITY FIX: the original used eval(), which would execute
            # arbitrary code returned by the LLM. literal_eval accepts only
            # Python literals and raises on anything else.
            search_queries = ast.literal_eval(cleaned_response)
            if isinstance(search_queries, list):
                return search_queries
            else:
                print("The LLM response is not a list. Response:", response)
                return []
        except Exception as e:
            print("Error interpreting search queries:", e, "\nResponse:", response)
            return []
    return []
|
91 |
+
|
92 |
+
async def perform_search_async(session, query):
    """
    Run a Google search for *query* through the SERPAPI endpoint.

    Returns the list of organic-result links (possibly empty); any API or
    transport failure is printed and yields [].
    """
    params = {
        "q": query,
        "api_key": SERPAPI_API_KEY,
        "engine": "google",
    }
    try:
        async with session.get(SERPAPI_URL, params=params) as resp:
            if resp.status != 200:
                text = await resp.text()
                print(f"SERPAPI error: {resp.status} - {text}")
                return []
            results = await resp.json()
            if "organic_results" not in results:
                print("No organic results found in SERPAPI response.")
                return []
            return [item.get("link") for item in results["organic_results"] if "link" in item]
    except Exception as e:
        print("Error during SERPAPI search:", e)
        return []
|
118 |
+
|
119 |
+
async def fetch_webpage_text_async(session, url):
    """
    Fetch a webpage's readable text via the Jina reader proxy.

    The target URL is appended to the Jina base URL; returns the page
    text, or "" on any failure (errors are printed, not raised).
    """
    full_url = f"{JINA_BASE_URL}{url}"
    auth_headers = {"Authorization": f"Bearer {JINA_API_KEY}"}
    try:
        async with session.get(full_url, headers=auth_headers) as resp:
            if resp.status == 200:
                return await resp.text()
            text = await resp.text()
            print(f"Jina fetch error for {url}: {resp.status} - {text}")
            return ""
    except Exception as e:
        print("Error retrieving webpage text with Jina:", e)
        return ""
|
138 |
+
|
139 |
+
async def is_page_useful_async(session, user_query, page_text):
    """
    Ask the LLM whether *page_text* is pertinent to the user's topic.

    Returns the string "Yes" or "No". Defaults to "No" when the LLM call
    fails or gives no recognizable answer.
    """
    prompt = (
        "You are a discerning evaluator of research. Given the user's topic and a snippet of webpage content, "
        "decide if the page contains valuable information to address the query. "
        "Reply strictly with one word: 'Yes' if the content is useful, or 'No' if it is not. Provide no extra text."
    )
    messages = [
        {"role": "system", "content": "You are a concise and strict research relevance evaluator."},
        {"role": "user", "content": f"User Topic: {user_query}\n\nWebpage Snippet (up to 20000 characters):\n{page_text[:20000]}\n\n{prompt}"}
    ]
    response = await call_openrouter_async(session, messages)
    if response:
        answer = response.strip()
        if answer in ("Yes", "No"):
            return answer
        # The model added extra text; fall back to substring matching,
        # preferring "Yes" when both words appear.
        if "Yes" in answer:
            return "Yes"
        if "No" in answer:
            return "No"
    return "No"
|
163 |
+
|
164 |
+
async def extract_relevant_context_async(session, user_query, search_query, page_text):
    """
    Have the LLM extract the parts of *page_text* relevant to the topic.

    The search query that surfaced the page is supplied for context.
    Returns the extracted text, or "" when the LLM call fails.
    """
    prompt = (
        "You are an expert extractor of information. Given the user's topic, the search query that produced this page, "
        "and the webpage text, extract all pertinent details needed to answer the inquiry. "
        "Return only the relevant text without any additional commentary."
    )
    messages = [
        {"role": "system", "content": "You excel at summarizing and extracting relevant details."},
        {"role": "user", "content": f"User Topic: {user_query}\nSearch Query: {search_query}\n\nWebpage Snippet (up to 20000 characters):\n{page_text[:20000]}\n\n{prompt}"}
    ]
    response = await call_openrouter_async(session, messages)
    return response.strip() if response else ""
|
181 |
+
|
182 |
+
async def get_new_search_queries_async(session, user_query, previous_search_queries, all_contexts):
    """
    Evaluate if additional search queries are necessary based on the current research progress.

    Returns a list of new query strings, "" when the LLM indicates research
    is complete, or [] on failure/unparseable output.
    """
    import ast  # local import: only needed here to safely parse the LLM's list literal

    context_combined = "\n".join(all_contexts)
    prompt = (
        "You are a systematic research planner. Taking into account the original topic, prior search queries, "
        "and the extracted information from webpages, determine if more research is required. "
        "If so, produce up to four new search queries as a Python list "
        "(for example: ['new query1', 'new query2']). If no further research is needed, reply with an empty string."
        "\nReturn only a Python list or an empty string without extra commentary."
    )
    messages = [
        {"role": "system", "content": "You are methodical in planning further research steps."},
        {"role": "user", "content": f"User Topic: {user_query}\nPrevious Queries: {previous_search_queries}\n\nCollected Context:\n{context_combined}\n\n{prompt}"}
    ]
    response = await call_openrouter_async(session, messages)
    if response:
        cleaned = response.strip()
        if cleaned == "":
            return ""
        try:
            # Strip a Markdown code fence if the model added one despite instructions.
            if cleaned.startswith("```"):
                cleaned = cleaned.split("```")[1]
                if cleaned.startswith("python"):
                    cleaned = cleaned[6:]
            cleaned = cleaned.strip()

            # SECURITY FIX: the original used eval(), which would execute
            # arbitrary code returned by the LLM. literal_eval accepts only
            # Python literals and raises on anything else.
            new_queries = ast.literal_eval(cleaned)
            if isinstance(new_queries, list):
                return new_queries
            else:
                print("LLM response is not a list for extra search queries. Response:", response)
                return []
        except Exception as e:
            print("Failed to parse additional search queries:", e, "\nResponse:", response)
            return []
    return []
|
219 |
+
|
220 |
+
async def generate_final_report_async(session, user_query, sourced_contexts):
    """
    Compose the final, cited report from all collected contexts.

    Assigns a citation number to each distinct source URL (first-seen
    order), tags every context snippet with its number, asks the LLM to
    write the report, and appends a numbered references section. Returns
    an error string when the LLM call fails.
    """
    # Map each unique source URL to a stable citation number.
    references = {}
    formatted_contexts = []
    for ctx in sourced_contexts:
        if ctx.source_url not in references:
            references[ctx.source_url] = len(references) + 1
        formatted_contexts.append(f"{ctx.text} [{references[ctx.source_url]}]")

    context_combined = "\n".join(formatted_contexts)

    # Build the reference section in citation-number order.
    reference_list = [f"[{num}] {url}" for url, num in sorted(references.items(), key=lambda x: x[1])]
    reference_section = "\n\nReferences:\n" + "\n".join(reference_list)

    prompt = (
        "You are a proficient academic report writer. Using the compiled contexts below and the original topic, "
        "compose a comprehensive, well-organized, and in-depth report that fully addresses the inquiry. "
        "Ensure that each piece of evidence is tagged with citation numbers in square brackets (e.g., [1], [2]). "
        "Maintain these tags in your final report to show the references. "
        "The style should be academic with proper in-text citations. Do not alter or add citation numbers."
    )

    messages = [
        {"role": "system", "content": "You are an expert academic report composer."},
        {"role": "user", "content": f"User Topic: {user_query}\n\nCollected Context:\n{context_combined}\n\n{prompt}"}
    ]

    report = await call_openrouter_async(session, messages)
    if report:
        return report + reference_section
    return "Error occurred while generating the report."
|
258 |
+
|
259 |
+
async def process_link(session, link, user_query, search_query):
    """
    Process a single URL end to end: fetch, judge relevance, extract.

    Returns a SourcedContext when the page is useful and yields context;
    None when the fetch fails, the page is judged irrelevant, or no
    context can be extracted.
    """
    print(f"Retrieving content from: {link}")
    page_text = await fetch_webpage_text_async(session, link)
    if not page_text:
        return None

    usefulness = await is_page_useful_async(session, user_query, page_text)
    print(f"Relevance of {link}: {usefulness}")
    if usefulness != "Yes":
        return None

    context = await extract_relevant_context_async(session, user_query, search_query, page_text)
    if not context:
        return None
    print(f"Context extracted from {link} (first 200 characters): {context[:200]}")
    return SourcedContext(context, link)
|
276 |
+
|
277 |
+
async def research_flow(user_query, iteration_limit):
    """
    Primary research procedure intended for integration with Streamlit.

    Generates initial search queries, then iterates up to
    *iteration_limit* rounds of: search -> deduplicate links -> fetch,
    judge, and extract concurrently -> ask the LLM whether more queries
    are needed. Returns the final cited report (or an error message when
    no initial queries could be generated).
    """
    sourced_contexts = []
    all_search_queries = []

    async with aiohttp.ClientSession() as session:
        new_search_queries = await generate_search_queries_async(session, user_query)
        if not new_search_queries:
            return "No search queries were generated by the LLM. Terminating process."
        all_search_queries.extend(new_search_queries)

        for iteration in range(iteration_limit):
            print(f"\n--- Iteration {iteration + 1} ---")

            # Run every search for this round concurrently.
            search_results = await asyncio.gather(
                *(perform_search_async(session, query) for query in new_search_queries)
            )

            # Deduplicate links, remembering the first query that surfaced each.
            unique_links = {}
            for idx, links in enumerate(search_results):
                query = new_search_queries[idx]
                for link in links:
                    unique_links.setdefault(link, query)

            print(f"Collected {len(unique_links)} distinct links in this iteration.")

            # Fetch, judge, and extract from all links concurrently.
            link_results = await asyncio.gather(
                *(process_link(session, link, user_query, unique_links[link])
                  for link in unique_links)
            )
            iteration_contexts = [res for res in link_results if res]

            if iteration_contexts:
                sourced_contexts.extend(iteration_contexts)
            else:
                print("No relevant information was found in this iteration.")

            # Ask the LLM whether further research rounds are warranted.
            context_texts = [ctx.text for ctx in sourced_contexts]
            new_search_queries = await get_new_search_queries_async(
                session, user_query, all_search_queries, context_texts
            )

            if new_search_queries == "":
                print("LLM has determined that additional research is unnecessary.")
                break
            elif new_search_queries:
                print("LLM provided extra search queries:", new_search_queries)
                all_search_queries.extend(new_search_queries)
            else:
                print("LLM returned no further search queries. Concluding the loop.")
                break

        final_report = await generate_final_report_async(session, user_query, sourced_contexts)

    return final_report
|
341 |
+
|
342 |
+
def main():
    """CLI entry point for testing this research module interactively."""
    user_query = input("Enter your research topic/question: ").strip()
    raw_limit = input("Enter the maximum number of iterations (default is 10): ").strip()
    # Blank or non-numeric input falls back to the default of 10 rounds.
    iteration_limit = int(raw_limit) if raw_limit.isdigit() else 10

    report = asyncio.run(research_flow(user_query, iteration_limit))
    print("\n==== FINAL REPORT ====\n")
    print(report)


if __name__ == "__main__":
    main()
|