Update app.py
app.py CHANGED
@@ -1,16 +1,18 @@
-# app.py – Now includes DuckDuckGo, arXiv, and Semantic Scholar crawling
-
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 import gradio as gr
 import arxiv
 from semanticscholar import SemanticScholar
-
+import requests
+
+
+GOOGLE_API_KEY = "AIzaSyAhMzIHz2R5VaHC7uSXcZ9yK4luL0yV3sM"
+GOOGLE_CSE_ID = "b2d08ab5820ff465d"
 
 # Load sentence transformer
 model = SentenceTransformer('all-MiniLM-L6-v2')
 
-# Math domain definitions
+# Math domain definitions
 DOMAINS = {
     "Real Analysis": "Studies properties of real-valued functions, sequences, limits, continuity, differentiation, Riemann/Lebesgue integration, and convergence in the real number system.",
     "Complex Analysis": "Explores analytic functions of complex variables, contour integration, conformal mappings, and singularity theory.",

@@ -61,6 +63,7 @@ DOMAINS = {
     "Others / Multidisciplinary": "Covers problems that span multiple mathematical areas or do not fall neatly into a traditional domain."
 }
 
+# Core Functions
 domain_names = list(DOMAINS.keys())
 domain_texts = list(DOMAINS.values())
 domain_embeddings = model.encode(domain_texts)

@@ -81,28 +84,104 @@ def fetch_arxiv_refs(query, max_results=5):
         pass
     return refs
 
-
+
+def fetch_google_cse_links(query, max_results=5):
+    url = "https://www.googleapis.com/customsearch/v1"
     links = []
+    all_snippets = []
+    all_items = []
+    # Social-media and solver/calculator sites to skip (hoisted out of the result loop).
+    excluded_domains = [
+        "facebook.com", "twitter.com", "instagram.com", "linkedin.com", "tiktok.com",
+        "wolframalpha.com", "symbolab.com", "cymath.com", "mathway.com", "mathsolver.microsoft.com",
+        "photomath.com", "mathpapa.com", "integral-calculator.com", "derivative-calculator.net",
+        "mathportal.org", "stattrek.com", "calculatorsoup.com", "desmos.com", "geogebra.org",
+        "socratic.org", "chegg.com", "quizlet.com"
+    ]
     try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # The CSE API serves at most 10 results per request, so page with 'start'.
+        for start in range(1, 100, 10):  # fetch up to 100 results in pages of 10
+            params = {
+                "q": query,
+                "key": GOOGLE_API_KEY,
+                "cx": GOOGLE_CSE_ID,
+                "num": 10,
+                "start": start
+            }
+            res = requests.get(url, params=params)
+            items = res.json().get("items", [])
+            if not items:
+                break
+            for item in items:
+                link_url = item.get("link", "")  # renamed so it does not shadow the endpoint 'url'
+                if any(domain in link_url.lower() for domain in excluded_domains):
+                    continue
+                snippet = item.get("snippet", "")
+                all_items.append({
+                    "title": item.get("title", "No Title"),
+                    "url": link_url,
+                    "snippet": snippet
+                })
+                all_snippets.append(snippet)
+
+        if not all_snippets:
+            return links
+
+        # Greedily group near-duplicate snippets by embedding similarity.
+        snippet_embeddings = model.encode(all_snippets)
+        used = set()
+        for i, emb_i in enumerate(snippet_embeddings):
+            if i in used:
+                continue
+            group = [i]
+            for j in range(i + 1, len(snippet_embeddings)):
+                if j in used:
+                    continue
+                sim = cosine_similarity([emb_i], [snippet_embeddings[j]])[0][0]
+                if sim > 0.8:
+                    group.append(j)
+                    used.add(j)
+            # Prefer a result with PDF in URL if available
+            chosen = None
+            for idx in group:
+                if ".pdf" in all_items[idx]["url"].lower():
+                    chosen = all_items[idx]
                     break
-
-
+            if not chosen:
+                chosen = all_items[group[0]]
+            links.append({
+                "title": chosen["title"],
+                "url": chosen["url"],
+                "snippet": chosen["snippet"],
+                "source": "Google CSE"
+            })
+            used.update(group)
+            if len(links) >= max_results:
+                break
+    except Exception as e:
+        print("Google CSE Error:", e)
     return links
 
+
+# Extract the top-k semantically relevant sections from a web page using all-MiniLM-L6-v2
+from bs4 import BeautifulSoup
+def extract_top_sections_from_url(query, url, top_k=5):
+    try:
+        res = requests.get(url, timeout=6)
+        if res.status_code != 200:
+            return []
+        soup = BeautifulSoup(res.text, 'html.parser')
+        paras = soup.find_all(['p', 'li', 'div'])
+        clean_paras = [p.get_text(strip=True) for p in paras if len(p.get_text(strip=True)) > 80]
+        if not clean_paras:
+            return []
+        query_embed = model.encode([query])          # plain arrays; sklearn does not need tensors
+        para_embeds = model.encode(clean_paras)
+        sims = cosine_similarity(query_embed, para_embeds)[0]
+        top_indices = sims.argsort()[-top_k:][::-1]
+        return [clean_paras[i] for i in top_indices]
+    except Exception as e:
+        print(f"Error extracting from {url}: {e}")
+        return []
+
+# Classify the question and render the HTML output
 def classify_math_question(question):
     q_embed = model.encode([question])
     scores = cosine_similarity(q_embed, domain_embeddings)[0]

@@ -116,7 +195,15 @@ def classify_math_question(question):
     out += f"<b>Minor Domain:</b> {minor}<br><i>Reason:</i> {minor_reason}<br><br>"
 
     refs = fetch_arxiv_refs(question, max_results=5)
-    links =
+    links = fetch_google_cse_links(question, max_results=5)
+    all_links = links
+
+    # Enrich links with top-5 extracted sections
+    enriched_links = []
+    for link in all_links:
+        top_sections = extract_top_sections_from_url(question, link['url'])
+        link['top_sections'] = top_sections
+        enriched_links.append(link)
 
     if refs:
         out += "<b>Top Academic References (arXiv):</b><ul>"

@@ -126,22 +213,29 @@ def classify_math_question(question):
     else:
         out += "<i>No academic references found.</i><br>"
 
-    if
-        out += "<b>Top Web Resources (
-        for link in
-            out += f"<li><b>{link['title']}</b><br>{link['snippet']}<br><a href='{link['url']}' target='_blank'>{link['url']}</a
+    if enriched_links:
+        out += "<b>Top Web Resources (Google CSE):</b><ul>"
+        for link in enriched_links:
+            out += f"<li><b>{link['title']}</b><br>{link['snippet']}<br><a href='{link['url']}' target='_blank'>{link['url']}</a>"
+            if link['top_sections']:
+                out += "<br><u>Top Extracted Sections:</u><ol>"
+                for sec in link['top_sections']:
+                    out += f"<li>{sec}</li>"
+                out += "</ol>"
+            out += "</li>"
         out += "</ul>"
     else:
         out += "<i>No web links found.</i>"
 
     return out
 
+# gradio
 iface = gr.Interface(
     fn=classify_math_question,
     inputs=gr.Textbox(lines=5, label="Enter Math Question (LaTeX supported)"),
     outputs=gr.HTML(label="Predicted Domains + References"),
-    title="⚡
-    description="Classifies math problems into major/minor domains and fetches fast references from arXiv
+    title="⚡ Math Domain Classifier with arXiv + Google",
+    description="Classifies math problems into major/minor domains and fetches fast references from arXiv and Google."
 )
 
 iface.launch()
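
A note on the classification step: the app ranks DOMAINS by cosine similarity between the question embedding and the precomputed domain-description embeddings; the major/minor selection lines are elided from this diff, so the top-2 choice below is an assumption. A minimal sketch of that ranking with just two of the domains:

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer('all-MiniLM-L6-v2')
domains = {
    "Real Analysis": "Studies real-valued functions, sequences, limits, continuity, and convergence.",
    "Complex Analysis": "Explores analytic functions of complex variables and contour integration.",
}
names = list(domains)
domain_embeddings = model.encode(list(domains.values()))

question = "Show that the sequence x_{n+1} = sqrt(2 + x_n) converges."
scores = cosine_similarity(model.encode([question]), domain_embeddings)[0]
order = np.argsort(scores)[::-1]  # domain indices, best match first
print("Major:", names[order[0]], "| Minor:", names[order[1]])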
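
On the paging loop in fetch_google_cse_links: the Custom Search JSON API returns at most 10 results per request, and further pages are requested with the 1-based start parameter (1, 11, 21, ...), which is why the loop steps by 10. A standalone sketch of one paged query; the key and engine ID here are placeholders, not the credentials in app.py, and API quota limits apply:

import requests

API_KEY = "YOUR_API_KEY"  # placeholder, not a real credential
CSE_ID = "YOUR_CSE_ID"    # placeholder engine ID

def cse_page(query, start):
    # Fetch one page of up to 10 results; 'start' is 1-based.
    res = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params={"q": query, "key": API_KEY, "cx": CSE_ID, "num": 10, "start": start},
        timeout=10,
    )
    res.raise_for_status()
    return res.json().get("items", [])

items = []
for start in range(1, 31, 10):  # first three pages
    page = cse_page("monotone convergence theorem", start)
    if not page:
        break
    items.extend(page)
print(len(items), "results fetched")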
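
The snippet deduplication is a greedy single pass: each unused snippet seeds a group, later snippets join when the cosine similarity of their embeddings exceeds 0.8, and one representative per group survives, with PDF links preferred. A self-contained sketch of the same grouping on made-up results (the results list is illustrative, not real CSE output):

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer('all-MiniLM-L6-v2')

results = [
    {"title": "Rolle's theorem notes", "url": "https://example.edu/rolle.pdf",
     "snippet": "Rolle's theorem: a differentiable function with equal endpoint values has a stationary point."},
    {"title": "Rolle's theorem (mirror)", "url": "https://example.org/rolle.html",
     "snippet": "Rolle's theorem states that a differentiable function taking equal values at the endpoints has a critical point."},
    {"title": "Fourier series intro", "url": "https://example.com/fourier.html",
     "snippet": "A Fourier series decomposes a periodic function into sines and cosines."},
]

embeddings = model.encode([r["snippet"] for r in results])
used, picked = set(), []
for i in range(len(results)):
    if i in used:
        continue
    # Group i with every later, unused snippet whose similarity exceeds 0.8.
    group = [i] + [j for j in range(i + 1, len(results))
                   if j not in used
                   and cosine_similarity([embeddings[i]], [embeddings[j]])[0][0] > 0.8]
    used.update(group)
    # Keep one representative per group, preferring a PDF link as app.py does.
    chosen = next((results[k] for k in group if ".pdf" in results[k]["url"].lower()),
                  results[group[0]])
    picked.append(chosen)

for r in picked:
    print(r["title"], "->", r["url"])
# Expected: the two Rolle snippets collapse to the PDF; the Fourier result stays.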