yuvraj-yadav committed on
Commit 3c8e99b · verified · 1 Parent(s): 051c6a0

Update app.py

Files changed (1):
  app.py +28 -122
app.py CHANGED
@@ -1,18 +1,16 @@
+# app.py – Now includes DuckDuckGo, arXiv, and Semantic Scholar crawling
+
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 import gradio as gr
 import arxiv
 from semanticscholar import SemanticScholar
-import requests
-
-
-GOOGLE_API_KEY = "AIzaSyAhMzIHz2R5VaHC7uSXcZ9yK4luL0yV3sM"
-GOOGLE_CSE_ID = "b2d08ab5820ff465d"
+from duckduckgo_search import DDGS
 
 # Load sentence transformer
 model = SentenceTransformer('all-MiniLM-L6-v2')
 
-# Math domain definitions
+# Math domain definitions (trimmed for brevity)
 DOMAINS = {
     "Real Analysis": "Studies properties of real-valued functions, sequences, limits, continuity, differentiation, Riemann/Lebesgue integration, and convergence in the real number system.",
     "Complex Analysis": "Explores analytic functions of complex variables, contour integration, conformal mappings, and singularity theory.",
@@ -63,7 +61,6 @@ DOMAINS = {
     "Others / Multidisciplinary": "Covers problems that span multiple mathematical areas or do not fall neatly into a traditional domain."
 }
 
-# Core Functions
 domain_names = list(DOMAINS.keys())
 domain_texts = list(DOMAINS.values())
 domain_embeddings = model.encode(domain_texts)
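The three lines above precompute one embedding per domain description; classify_math_question (later hunks) then ranks domains by cosine similarity between the question embedding and this matrix. A minimal standalone sketch of that ranking, reusing the names from the diff (the argsort-based major/minor pick is an assumption, since the selection logic sits outside the shown hunks):

import numpy as np

# Rank all domains against one sample question, using model, domain_names,
# and domain_embeddings as defined above.
q_embed = model.encode(["Show that every Cauchy sequence of real numbers converges."])
scores = cosine_similarity(q_embed, domain_embeddings)[0]
major, minor = np.argsort(scores)[::-1][:2]
print("Major:", domain_names[major], "| Minor:", domain_names[minor])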
@@ -84,104 +81,28 @@ def fetch_arxiv_refs(query, max_results=5):
         pass
     return refs
 
-
-def fetch_google_cse_links(query, max_results=5):
-    url = "https://www.googleapis.com/customsearch/v1"
+def fetch_duckduckgo_links(query, max_results=10):
     links = []
-    all_snippets = []
-    all_items = []
     try:
-        for start in range(1, 100, 100):  # fetch up to 100 results in batches of 100
-            params = {
-                "q": query,
-                "key": GOOGLE_API_KEY,
-                "cx": GOOGLE_CSE_ID,
-                "num": 10,
-                "start": start
-            }
-            res = requests.get(url, params=params)
-            items = res.json().get("items", [])
-            if not items:
-                break
-            for item in items:
-                url = item.get("link", "")
-                excluded_domains = [
-                    "facebook.com", "twitter.com", "instagram.com", "linkedin.com", "tiktok.com",
-                    "wolframalpha.com", "symbolab.com", "cymath.com", "mathway.com", "mathsolver.microsoft.com",
-                    "photomath.com", "mathpapa.com", "integral-calculator.com", "derivative-calculator.net",
-                    "mathportal.org", "stattrek.com", "calculatorsoup.com", "desmos.com", "geogebra.org",
-                    "socratic.org", "chegg.com", "quizlet.com"
-                ]
-                if any(domain in url.lower() for domain in excluded_domains):
-                    continue
-                snippet = item.get("snippet", "")
-                all_items.append({
-                    "title": item.get("title", "No Title"),
-                    "url": url,
-                    "snippet": snippet
-                })
-                all_snippets.append(snippet)
-
-        if not all_snippets:
-            return links
-
-        snippet_embeddings = model.encode(all_snippets)
-        used = set()
-        for i, emb_i in enumerate(snippet_embeddings):
-            if i in used:
-                continue
-            group = [i]
-            for j in range(i + 1, len(snippet_embeddings)):
-                if j in used:
-                    continue
-                sim = cosine_similarity([emb_i], [snippet_embeddings[j]])[0][0]
-                if sim > 0.8:
-                    group.append(j)
-                    used.add(j)
-            # Prefer a result with PDF in URL if available
-            chosen = None
-            for idx in group:
-                if ".pdf" in all_items[idx]["url"].lower():
-                    chosen = all_items[idx]
-                    break
-            if not chosen:
-                chosen = all_items[group[0]]
-            links.append({
-                "title": chosen["title"],
-                "url": chosen["url"],
-                "snippet": chosen["snippet"],
-                "source": "Google CSE"
-            })
-            used.update(group)
-            if len(links) >= max_results:
-                break
-    except Exception as e:
-        print("Google CSE Error:", e)
+        with DDGS() as ddgs:
+            results = ddgs.text(query, max_results=max_results)
+            count = 0
+            for res in results:
+                url = res['href']
+                if ".edu" in url or ".org" in url:
+                    links.append({
+                        "title": res['title'],
+                        "url": url,
+                        "snippet": res['body'],
+                        "source": "DuckDuckGo"
+                    })
+                    count += 1
+                    if count >= 3:
+                        break
+    except:
+        pass
     return links
 
-
-# Extract top-5 semantically relevant sections from a web page using all-MiniLM-L6-v2
-from bs4 import BeautifulSoup
-def extract_top_sections_from_url(query, url, top_k=5):
-    try:
-        res = requests.get(url, timeout=6)
-        if res.status_code != 200:
-            return []
-        soup = BeautifulSoup(res.text, 'html.parser')
-        paras = soup.find_all(['p', 'li', 'div'])
-        clean_paras = [p.get_text(strip=True) for p in paras if len(p.get_text(strip=True)) > 80]
-        if not clean_paras:
-            return []
-        query_embed = model.encode(query, convert_to_tensor=True)
-        para_embeds = model.encode(clean_paras, convert_to_tensor=True)
-        sims = cosine_similarity([query_embed], para_embeds)[0]
-        top_indices = sims.argsort()[-top_k:][::-1]
-        return [clean_paras[i] for i in top_indices]
-    except Exception as e:
-        print(f"Error extracting from {url}: {e}")
-        return []
-
-# Output
 def classify_math_question(question):
     q_embed = model.encode([question])
     scores = cosine_similarity(q_embed, domain_embeddings)[0]
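The new fetcher assumes DDGS.text() yields dicts with 'title', 'href', and 'body' keys, which is what the res['href'] and res['body'] accesses rely on. A standalone way to exercise the same pattern, with a hypothetical query (requires network access):

from duckduckgo_search import DDGS

# Same filter as fetch_duckduckgo_links: keep results whose URL contains .edu/.org.
with DDGS() as ddgs:
    for res in ddgs.text("Cauchy-Schwarz inequality proof", max_results=10):
        if ".edu" in res['href'] or ".org" in res['href']:
            print(res['title'], '->', res['href'])

Note that the substring test also matches URLs that merely contain ".edu" or ".org" anywhere in the string; parsing the hostname would be stricter.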
@@ -195,15 +116,7 @@ def classify_math_question(question):
     out += f"<b>Minor Domain:</b> {minor}<br><i>Reason:</i> {minor_reason}<br><br>"
 
     refs = fetch_arxiv_refs(question, max_results=5)
-    links = fetch_google_cse_links(question, max_results=5)
-    all_links = links
-
-    # Enrich links with top-5 extracted sections
-    enriched_links = []
-    for link in all_links:
-        top_sections = extract_top_sections_from_url(question, link['url'])
-        link['top_sections'] = top_sections
-        enriched_links.append(link)
+    links = fetch_duckduckgo_links(question, max_results=3)
 
     if refs:
         out += "<b>Top Academic References (arXiv):</b><ul>"
@@ -213,29 +126,22 @@ def classify_math_question(question):
     else:
         out += "<i>No academic references found.</i><br>"
 
-    if enriched_links:
-        out += "<b>Top Web Resources (Google CSE):</b><ul>"
-        for link in enriched_links:
-            out += f"<li><b>{link['title']}</b><br>{link['snippet']}<br><a href='{link['url']}' target='_blank'>{link['url']}</a>"
-            if link['top_sections']:
-                out += "<br><u>Top Extracted Sections:</u><ol>"
-                for sec in link['top_sections']:
-                    out += f"<li>{sec}</li>"
-                out += "</ol>"
-            out += "</li>"
+    if links:
+        out += "<b>Top Web Resources (DuckDuckGo):</b><ul>"
+        for link in links:
+            out += f"<li><b>{link['title']}</b><br>{link['snippet']}<br><a href='{link['url']}' target='_blank'>{link['url']}</a></li>"
         out += "</ul>"
     else:
         out += "<i>No web links found.</i>"
 
     return out
 
-# gradio
 iface = gr.Interface(
     fn=classify_math_question,
     inputs=gr.Textbox(lines=5, label="Enter Math Question (LaTeX supported)"),
     outputs=gr.HTML(label="Predicted Domains + References"),
-    title="⚡ Math Domain Classifier with arXiv + Google",
-    description="Classifies math problems into major/minor domains and fetches fast references from arXiv and Google."
+    title="⚡ Fast Math Domain Classifier with arXiv + DuckDuckGo",
+    description="Classifies math problems into major/minor domains and fetches fast references from arXiv + DuckDuckGo."
 )
 
 iface.launch()
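Since iface.launch() is unchanged, the updated pipeline can be smoke-tested without the UI by calling the handler directly (a sketch; assumes the dependencies above are installed):

# Direct call to the Gradio handler with a sample question.
html = classify_math_question("Evaluate the integral of 1/z around the unit circle.")
print(html)  # predicted domains plus arXiv and DuckDuckGo references, as HTML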
 