Update app.py
app.py
CHANGED
@@ -1,18 +1,16 @@
+# app.py – Now includes DuckDuckGo, arXiv, and Semantic Scholar crawling
+
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 import gradio as gr
 import arxiv
 from semanticscholar import SemanticScholar
-import requests
-
-
-GOOGLE_API_KEY = "AIzaSyAhMzIHz2R5VaHC7uSXcZ9yK4luL0yV3sM"
-GOOGLE_CSE_ID = "b2d08ab5820ff465d"
+from duckduckgo_search import DDGS

 # Load sentence transformer
 model = SentenceTransformer('all-MiniLM-L6-v2')

-# Math domain definitions
+# Math domain definitions (trimmed for brevity)
 DOMAINS = {
     "Real Analysis": "Studies properties of real-valued functions, sequences, limits, continuity, differentiation, Riemann/Lebesgue integration, and convergence in the real number system.",
     "Complex Analysis": "Explores analytic functions of complex variables, contour integration, conformal mappings, and singularity theory.",
@@ -63,7 +61,6 @@ DOMAINS = {
     "Others / Multidisciplinary": "Covers problems that span multiple mathematical areas or do not fall neatly into a traditional domain."
 }

-# Core Functions
 domain_names = list(DOMAINS.keys())
 domain_texts = list(DOMAINS.values())
 domain_embeddings = model.encode(domain_texts)
@@ -84,104 +81,28 @@ def fetch_arxiv_refs(query, max_results=5):
         pass
     return refs

-
-def fetch_google_cse_links(query, max_results=5):
-    url = "https://www.googleapis.com/customsearch/v1"
+def fetch_duckduckgo_links(query, max_results=10):
     links = []
-    all_snippets = []
-    all_items = []
     try:
-        params = {
-            "key": GOOGLE_API_KEY,
-            "cx": GOOGLE_CSE_ID,
-            "q": query,
-            "num": 10
-        }
-        res = requests.get(url, params=params, timeout=6)
-        data = res.json()
-        items = data.get("items", [])
-
-        for item in items:
-            url = item.get("link", "")
-            if not url:
-                continue
-            excluded_domains = [
-                "facebook.com", "twitter.com", "instagram.com", "linkedin.com", "tiktok.com",
-                "wolframalpha.com", "symbolab.com", "cymath.com", "mathway.com", "mathsolver.microsoft.com",
-                "photomath.com", "mathpapa.com", "integral-calculator.com", "derivative-calculator.net",
-                "mathportal.org", "stattrek.com", "calculatorsoup.com", "desmos.com", "geogebra.org",
-                "socratic.org", "chegg.com", "quizlet.com"
-            ]
-            if any(domain in url.lower() for domain in excluded_domains):
-                continue
-            snippet = item.get("snippet", "")
-            all_items.append({
-                "title": item.get("title", "No Title"),
-                "url": url,
-                "snippet": snippet
-            })
-            all_snippets.append(snippet)
-
-        if not all_snippets:
-            return links
-
-        snippet_embeddings = model.encode(all_snippets)
-        used = set()
-        for i, emb_i in enumerate(snippet_embeddings):
-            if i in used:
-                continue
-            group = [i]
-            for j in range(i + 1, len(snippet_embeddings)):
-                if j in used:
-                    continue
-                sim = cosine_similarity([emb_i], [snippet_embeddings[j]])[0][0]
-                if sim > 0.8:
-                    group.append(j)
-                    used.add(j)
-            # Prefer a result with PDF in URL if available
-            chosen = None
-            for idx in group:
-                if ".pdf" in all_items[idx]["url"].lower():
-                    chosen = all_items[idx]
+        with DDGS() as ddgs:
+            results = ddgs.text(query, max_results=max_results)
+        count = 0
+        for res in results:
+            url = res['href']
+            if ".edu" in url or ".org" in url:
+                links.append({
+                    "title": res['title'],
+                    "url": url,
+                    "snippet": res['body'],
+                    "source": "DuckDuckGo"
+                })
+                count += 1
+                if count >= 3:
                     break
-            if chosen is None:
-                chosen = all_items[group[0]]
-            links.append({
-                "title": chosen["title"],
-                "url": chosen["url"],
-                "snippet": chosen["snippet"],
-                "source": "Google CSE"
-            })
-            used.update(group)
-            if len(links) >= max_results:
-                break
-    except Exception as e:
-        print("Google CSE Error:", e)
+    except:
+        pass
     return links

-
-# Extract top-5 semantically relevant sections from a web page using all-MiniLM-L6-v2
-from bs4 import BeautifulSoup
-def extract_top_sections_from_url(query, url, top_k=5):
-    try:
-        res = requests.get(url, timeout=6)
-        if res.status_code != 200:
-            return []
-        soup = BeautifulSoup(res.text, 'html.parser')
-        paras = soup.find_all(['p', 'li', 'div'])
-        clean_paras = [p.get_text(strip=True) for p in paras if len(p.get_text(strip=True)) > 80]
-        if not clean_paras:
-            return []
-        query_embed = model.encode(query, convert_to_tensor=True)
-        para_embeds = model.encode(clean_paras, convert_to_tensor=True)
-        sims = cosine_similarity([query_embed], para_embeds)[0]
-        top_indices = sims.argsort()[-top_k:][::-1]
-        return [clean_paras[i] for i in top_indices]
-    except Exception as e:
-        print(f"Error extracting from {url}: {e}")
-        return []
-
-# Output
 def classify_math_question(question):
     q_embed = model.encode([question])
     scores = cosine_similarity(q_embed, domain_embeddings)[0]
@@ -195,15 +116,7 @@ def classify_math_question(question):
     out += f"<b>Minor Domain:</b> {minor}<br><i>Reason:</i> {minor_reason}<br><br>"

     refs = fetch_arxiv_refs(question, max_results=5)
-    links = fetch_google_cse_links(question, max_results=5)
-    all_links = links
-
-    # Enrich links with top-5 extracted sections
-    enriched_links = []
-    for link in all_links:
-        top_sections = extract_top_sections_from_url(question, link['url'])
-        link['top_sections'] = top_sections
-        enriched_links.append(link)
+    links = fetch_duckduckgo_links(question, max_results=3)

     if refs:
         out += "<b>Top Academic References (arXiv):</b><ul>"
@@ -213,29 +126,22 @@ def classify_math_question(question):
     else:
         out += "<i>No academic references found.</i><br>"

-    if enriched_links:
-        out += "<b>Top Web Resources (Google CSE):</b><ul>"
-        for link in enriched_links:
-            out += f"<li><b>{link['title']}</b><br>{link['snippet']}<br><a href='{link['url']}' target='_blank'>{link['url']}</a>"
-            if link['top_sections']:
-                out += "<br><u>Top Extracted Sections:</u><ol>"
-                for sec in link['top_sections']:
-                    out += f"<li>{sec}</li>"
-                out += "</ol>"
-            out += "</li>"
+    if links:
+        out += "<b>Top Web Resources (DuckDuckGo):</b><ul>"
+        for link in links:
+            out += f"<li><b>{link['title']}</b><br>{link['snippet']}<br><a href='{link['url']}' target='_blank'>{link['url']}</a></li>"
         out += "</ul>"
     else:
         out += "<i>No web links found.</i>"

     return out

-# gradio
 iface = gr.Interface(
     fn=classify_math_question,
     inputs=gr.Textbox(lines=5, label="Enter Math Question (LaTeX supported)"),
     outputs=gr.HTML(label="Predicted Domains + References"),
-    title="⚡ Math Domain Classifier with arXiv + Google CSE",
-    description="Classifies math problems into major/minor domains and fetches fast references from arXiv + Google CSE."
+    title="⚡ Fast Math Domain Classifier with arXiv + DuckDuckGo",
+    description="Classifies math problems into major/minor domains and fetches fast references from arXiv + DuckDuckGo."
 )

 iface.launch()
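For anyone who wants to exercise the new helper outside of Gradio, here is a minimal standalone sketch of the same DuckDuckGo lookup. It assumes the duckduckgo_search package is installed; DDGS.text() returns dicts with 'title', 'href', and 'body' keys (exactly the keys the committed code reads), and the demo query under __main__ is purely illustrative.

# Standalone sketch of the new DuckDuckGo helper (mirrors the committed code, minus Gradio).
from duckduckgo_search import DDGS

def fetch_duckduckgo_links(query, max_results=10):
    links = []
    try:
        with DDGS() as ddgs:
            results = ddgs.text(query, max_results=max_results)
        for res in results:
            url = res["href"]
            # Keep only .edu/.org results, as in the committed version.
            if ".edu" in url or ".org" in url:
                links.append({"title": res["title"], "url": url,
                              "snippet": res["body"], "source": "DuckDuckGo"})
                if len(links) >= 3:  # the commit caps filtered hits at three
                    break
    except Exception:
        pass  # swallow network/rate-limit errors and return what we have
    return links

if __name__ == "__main__":
    for link in fetch_duckduckgo_links("Cauchy residue theorem"):
        print(f"{link['title']} -> {link['url']}")

One caveat: the substring filter also matches any URL that merely contains ".edu" or ".org" anywhere in the string, so parsing the hostname (e.g. with urllib.parse) would make the filter stricter.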
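The classification core itself is untouched by this commit: the question and each domain description are embedded with all-MiniLM-L6-v2 and the domains are ranked by cosine similarity. Below is a minimal sketch of that step, with two domains and abbreviated descriptions for brevity; the helper name pick_domains is illustrative, since app.py formats the result as HTML instead of returning a tuple.

# Sketch of the embedding-based domain routing inside classify_math_question.
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer("all-MiniLM-L6-v2")

DOMAINS = {
    "Real Analysis": "Studies real-valued functions, sequences, limits, continuity, and integration.",
    "Complex Analysis": "Explores analytic functions, contour integration, and conformal mappings.",
}
domain_names = list(DOMAINS.keys())
domain_embeddings = model.encode(list(DOMAINS.values()))

def pick_domains(question):
    # Embed the question once, then rank every domain by cosine similarity.
    q_embed = model.encode([question])
    scores = cosine_similarity(q_embed, domain_embeddings)[0]
    ranked = scores.argsort()[::-1]
    return domain_names[ranked[0]], domain_names[ranked[1]]

major, minor = pick_domains("Evaluate the contour integral of 1/z around the unit circle.")
print("Major:", major, "| Minor:", minor)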