yuvraj-yadav committed on
Commit 051c6a0 · verified · 1 Parent(s): de77ff5

Update app.py

Files changed (1)
app.py +122 -28
app.py CHANGED
@@ -1,16 +1,18 @@
-# app.py – Now includes DuckDuckGo, arXiv, and Semantic Scholar crawling
-
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 import gradio as gr
 import arxiv
 from semanticscholar import SemanticScholar
-from duckduckgo_search import DDGS
+import requests
+
+
+GOOGLE_API_KEY = "AIzaSyAhMzIHz2R5VaHC7uSXcZ9yK4luL0yV3sM"
+GOOGLE_CSE_ID = "b2d08ab5820ff465d"
 
 # Load sentence transformer
 model = SentenceTransformer('all-MiniLM-L6-v2')
 
-# Math domain definitions (trimmed for brevity)
+# Math domain definitions
 DOMAINS = {
     "Real Analysis": "Studies properties of real-valued functions, sequences, limits, continuity, differentiation, Riemann/Lebesgue integration, and convergence in the real number system.",
     "Complex Analysis": "Explores analytic functions of complex variables, contour integration, conformal mappings, and singularity theory.",
@@ -61,6 +63,7 @@ DOMAINS = {
     "Others / Multidisciplinary": "Covers problems that span multiple mathematical areas or do not fall neatly into a traditional domain."
 }
 
+# Core Functions
 domain_names = list(DOMAINS.keys())
 domain_texts = list(DOMAINS.values())
 domain_embeddings = model.encode(domain_texts)
@@ -81,28 +84,104 @@ def fetch_arxiv_refs(query, max_results=5):
         pass
     return refs
 
-def fetch_duckduckgo_links(query, max_results=10):
+
+def fetch_google_cse_links(query, max_results=5):
+    api_url = "https://www.googleapis.com/customsearch/v1"
     links = []
+    all_snippets = []
+    all_items = []
+    # Skip social networks and calculator/solver sites
+    excluded_domains = [
+        "facebook.com", "twitter.com", "instagram.com", "linkedin.com", "tiktok.com",
+        "wolframalpha.com", "symbolab.com", "cymath.com", "mathway.com", "mathsolver.microsoft.com",
+        "photomath.com", "mathpapa.com", "integral-calculator.com", "derivative-calculator.net",
+        "mathportal.org", "stattrek.com", "calculatorsoup.com", "desmos.com", "geogebra.org",
+        "socratic.org", "chegg.com", "quizlet.com"
+    ]
     try:
-        with DDGS() as ddgs:
-            results = ddgs.text(query, max_results=max_results)
-            count = 0
-            for res in results:
-                url = res['href']
-                if ".edu" in url or ".org" in url:
-                    links.append({
-                        "title": res['title'],
-                        "url": url,
-                        "snippet": res['body'],
-                        "source": "DuckDuckGo"
-                    })
-                    count += 1
-                    if count >= 3:
-                        break
-    except:
-        pass
+        for start in range(1, 100, 10):  # fetch up to 100 results in batches of 10
+            params = {
+                "q": query,
+                "key": GOOGLE_API_KEY,
+                "cx": GOOGLE_CSE_ID,
+                "num": 10,
+                "start": start
+            }
+            res = requests.get(api_url, params=params)
+            items = res.json().get("items", [])
+            if not items:
+                break
+            for item in items:
+                link_url = item.get("link", "")
+                if any(domain in link_url.lower() for domain in excluded_domains):
+                    continue
+                snippet = item.get("snippet", "")
+                all_items.append({
+                    "title": item.get("title", "No Title"),
+                    "url": link_url,
+                    "snippet": snippet
+                })
+                all_snippets.append(snippet)
+
+        if not all_snippets:
+            return links
+
+        # Group near-duplicate snippets (cosine similarity > 0.8), keep one per group
+        snippet_embeddings = model.encode(all_snippets)
+        used = set()
+        for i, emb_i in enumerate(snippet_embeddings):
+            if i in used:
+                continue
+            group = [i]
+            for j in range(i + 1, len(snippet_embeddings)):
+                if j in used:
+                    continue
+                sim = cosine_similarity([emb_i], [snippet_embeddings[j]])[0][0]
+                if sim > 0.8:
+                    group.append(j)
+                    used.add(j)
+            # Prefer a result with a PDF in its URL if available
+            chosen = None
+            for idx in group:
+                if ".pdf" in all_items[idx]["url"].lower():
+                    chosen = all_items[idx]
+                    break
+            if not chosen:
+                chosen = all_items[group[0]]
+            links.append({
+                "title": chosen["title"],
+                "url": chosen["url"],
+                "snippet": chosen["snippet"],
+                "source": "Google CSE"
+            })
+            used.update(group)
+            if len(links) >= max_results:
+                break
+    except Exception as e:
+        print("Google CSE Error:", e)
     return links
 
+
+# Extract the top-5 semantically relevant sections from a web page using all-MiniLM-L6-v2
+from bs4 import BeautifulSoup
+
+def extract_top_sections_from_url(query, url, top_k=5):
+    try:
+        res = requests.get(url, timeout=6)
+        if res.status_code != 200:
+            return []
+        soup = BeautifulSoup(res.text, 'html.parser')
+        paras = soup.find_all(['p', 'li', 'div'])
+        clean_paras = [p.get_text(strip=True) for p in paras if len(p.get_text(strip=True)) > 80]
+        if not clean_paras:
+            return []
+        query_embed = model.encode([query])
+        para_embeds = model.encode(clean_paras)
+        sims = cosine_similarity(query_embed, para_embeds)[0]
+        top_indices = sims.argsort()[-top_k:][::-1]
+        return [clean_paras[i] for i in top_indices]
+    except Exception as e:
+        print(f"Error extracting from {url}: {e}")
+        return []
+
+# Output
 def classify_math_question(question):
     q_embed = model.encode([question])
     scores = cosine_similarity(q_embed, domain_embeddings)[0]
@@ -116,7 +195,15 @@ def classify_math_question(question):
     out += f"<b>Minor Domain:</b> {minor}<br><i>Reason:</i> {minor_reason}<br><br>"
 
     refs = fetch_arxiv_refs(question, max_results=5)
-    links = fetch_duckduckgo_links(question, max_results=3)
+    links = fetch_google_cse_links(question, max_results=5)
+
+    # Enrich each link with the top extracted sections from its page
+    enriched_links = []
+    for link in links:
+        link['top_sections'] = extract_top_sections_from_url(question, link['url'])
+        enriched_links.append(link)
 
     if refs:
         out += "<b>Top Academic References (arXiv):</b><ul>"
@@ -126,22 +213,29 @@ def classify_math_question(question):
     else:
         out += "<i>No academic references found.</i><br>"
 
-    if links:
-        out += "<b>Top Web Resources (DuckDuckGo):</b><ul>"
-        for link in links:
-            out += f"<li><b>{link['title']}</b><br>{link['snippet']}<br><a href='{link['url']}' target='_blank'>{link['url']}</a></li>"
+    if enriched_links:
+        out += "<b>Top Web Resources (Google CSE):</b><ul>"
+        for link in enriched_links:
+            out += f"<li><b>{link['title']}</b><br>{link['snippet']}<br><a href='{link['url']}' target='_blank'>{link['url']}</a>"
+            if link['top_sections']:
+                out += "<br><u>Top Extracted Sections:</u><ol>"
+                for sec in link['top_sections']:
+                    out += f"<li>{sec}</li>"
+                out += "</ol>"
+            out += "</li>"
         out += "</ul>"
     else:
         out += "<i>No web links found.</i>"
 
     return out
 
+# Gradio interface
 iface = gr.Interface(
     fn=classify_math_question,
     inputs=gr.Textbox(lines=5, label="Enter Math Question (LaTeX supported)"),
     outputs=gr.HTML(label="Predicted Domains + References"),
-    title="⚡ Fast Math Domain Classifier with arXiv + DuckDuckGo",
-    description="Classifies math problems into major/minor domains and fetches fast references from arXiv + DuckDuckGo."
+    title="⚡ Math Domain Classifier with arXiv + Google",
+    description="Classifies math problems into major/minor domains and fetches fast references from arXiv and Google."
 )
 
 iface.launch()
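For reference, the Custom Search JSON API that fetch_google_cse_links calls returns at most 10 results per request (num) and pages through results with a 1-based start offset, capping out at 100 results per query. A minimal standalone sketch of that paging loop, with placeholder credentials (API_KEY, CSE_ID, and search_cse are illustrative names, not part of the commit):

import requests

API_KEY = "YOUR_GOOGLE_API_KEY"  # placeholder, substitute a real API key
CSE_ID = "YOUR_CSE_ID"           # placeholder, substitute a real engine ID

def search_cse(query, pages=3):
    # Collect up to pages * 10 results, 10 per request (the API maximum).
    results = []
    for start in range(1, pages * 10, 10):  # start = 1, 11, 21, ...
        resp = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params={"q": query, "key": API_KEY, "cx": CSE_ID, "num": 10, "start": start},
            timeout=10,
        )
        items = resp.json().get("items", [])
        if not items:
            break
        results.extend(items)
    return results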
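The snippet-deduplication step in fetch_google_cse_links keeps one representative per group of results whose snippet embeddings exceed 0.8 cosine similarity. A self-contained toy run of the same grouping idea (the model name matches app.py; the sample snippets are invented for illustration):

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer('all-MiniLM-L6-v2')

# Invented snippets: the first two are near-duplicates and should collapse into one.
snippets = [
    "The Riemann integral is defined as a limit of Riemann sums.",
    "Riemann integration defines the integral via limits of Riemann sums.",
    "Group theory studies symmetry via sets equipped with a binary operation.",
]

sims = cosine_similarity(model.encode(snippets))

kept, used = [], set()
for i in range(len(snippets)):
    if i in used:
        continue
    # Mark everything similar to snippet i as used; keep i as the representative.
    used.update(j for j in range(i + 1, len(snippets)) if sims[i][j] > 0.8)
    kept.append(snippets[i])

print(kept)  # expect two representatives, not three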
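The commit stores GOOGLE_API_KEY and GOOGLE_CSE_ID as literals in app.py. A minimal sketch of loading them from the environment instead (hypothetical setup; on Hugging Face Spaces the values would be configured as repository secrets, which the Space exposes to the app as environment variables):

import os

# Hypothetical: assumes GOOGLE_API_KEY and GOOGLE_CSE_ID are set in the environment.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
GOOGLE_CSE_ID = os.environ.get("GOOGLE_CSE_ID", "")
if not (GOOGLE_API_KEY and GOOGLE_CSE_ID):
    raise RuntimeError("GOOGLE_API_KEY and GOOGLE_CSE_ID must be set")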