Update app.py
app.py
CHANGED
@@ -1,18 +1,16 @@
+# app.py – Now includes DuckDuckGo, arXiv, and Semantic Scholar crawling
+
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 import gradio as gr
 import arxiv
 from semanticscholar import SemanticScholar
-import requests
-
-
-GOOGLE_API_KEY = "AIzaSyAhMzIHz2R5VaHC7uSXcZ9yK4luL0yV3sM"
-GOOGLE_CSE_ID = "b2d08ab5820ff465d"
+from duckduckgo_search import DDGS

 # Load sentence transformer
 model = SentenceTransformer('all-MiniLM-L6-v2')

-# Math domain definitions
+# Math domain definitions (trimmed for brevity)
 DOMAINS = {
     "Real Analysis": "Studies properties of real-valued functions, sequences, limits, continuity, differentiation, Riemann/Lebesgue integration, and convergence in the real number system.",
     "Complex Analysis": "Explores analytic functions of complex variables, contour integration, conformal mappings, and singularity theory.",
@@ -63,7 +61,6 @@ DOMAINS = {
     "Others / Multidisciplinary": "Covers problems that span multiple mathematical areas or do not fall neatly into a traditional domain."
 }

-# Core Functions
 domain_names = list(DOMAINS.keys())
 domain_texts = list(DOMAINS.values())
 domain_embeddings = model.encode(domain_texts)
@@ -84,104 +81,28 @@ def fetch_arxiv_refs(query, max_results=5):
         pass
     return refs

-
-def fetch_google_cse_links(query, max_results=5):
-    url = "https://www.googleapis.com/customsearch/v1"
+def fetch_duckduckgo_links(query, max_results=10):
     links = []
-    all_snippets = []
-    all_items = []
     try:
-        params = {
-            "key": GOOGLE_API_KEY,
-            "cx": GOOGLE_CSE_ID,
-            "q": query,
-            "num": 10
-        }
-        res = requests.get(url, params=params, timeout=6)
-        data = res.json()
-        items = data.get("items", [])
-
-        for item in items:
-            url = item.get("link", "")
-            if not url:
-                continue
-            excluded_domains = [
-                "facebook.com", "twitter.com", "instagram.com", "linkedin.com", "tiktok.com",
-                "wolframalpha.com", "symbolab.com", "cymath.com", "mathway.com", "mathsolver.microsoft.com",
-                "photomath.com", "mathpapa.com", "integral-calculator.com", "derivative-calculator.net",
-                "mathportal.org", "stattrek.com", "calculatorsoup.com", "desmos.com", "geogebra.org",
-                "socratic.org", "chegg.com", "quizlet.com"
-            ]
-            if any(domain in url.lower() for domain in excluded_domains):
-                continue
-            snippet = item.get("snippet", "")
-            all_items.append({
-                "title": item.get("title", "No Title"),
-                "url": url,
-                "snippet": snippet
-            })
-            all_snippets.append(snippet)
-
-        if not all_snippets:
-            return links
-
-        snippet_embeddings = model.encode(all_snippets)
-        used = set()
-        for i, emb_i in enumerate(snippet_embeddings):
-            if i in used:
-                continue
-            group = [i]
-            for j in range(i + 1, len(snippet_embeddings)):
-                if j in used:
-                    continue
-                sim = cosine_similarity([emb_i], [snippet_embeddings[j]])[0][0]
-                if sim > 0.8:
-                    group.append(j)
-                    used.add(j)
-            # Prefer a result with PDF in URL if available
-            chosen = None
-            for idx in group:
-                if ".pdf" in all_items[idx]["url"].lower():
-                    chosen = all_items[idx]
+        with DDGS() as ddgs:
+            results = ddgs.text(query, max_results=max_results)
+        count = 0
+        for res in results:
+            url = res['href']
+            if ".edu" in url or ".org" in url:
+                links.append({
+                    "title": res['title'],
+                    "url": url,
+                    "snippet": res['body'],
+                    "source": "DuckDuckGo"
+                })
+                count += 1
+                if count >= 3:
                     break
-            if chosen is None:
-                chosen = all_items[group[0]]
-            links.append({
-                "title": chosen["title"],
-                "url": chosen["url"],
-                "snippet": chosen["snippet"],
-                "source": "Google CSE"
-            })
-            used.update(group)
-            if len(links) >= max_results:
-                break
-    except Exception as e:
-        print("Google CSE Error:", e)
+    except:
+        pass
     return links

-
-# Extract top-5 semantically relevant sections from a web page using all-MiniLM-L6-v2
-from bs4 import BeautifulSoup
-def extract_top_sections_from_url(query, url, top_k=5):
-    try:
-        res = requests.get(url, timeout=6)
-        if res.status_code != 200:
-            return []
-        soup = BeautifulSoup(res.text, 'html.parser')
-        paras = soup.find_all(['p', 'li', 'div'])
-        clean_paras = [p.get_text(strip=True) for p in paras if len(p.get_text(strip=True)) > 80]
-        if not clean_paras:
-            return []
-        query_embed = model.encode(query, convert_to_tensor=True)
-        para_embeds = model.encode(clean_paras, convert_to_tensor=True)
-        sims = cosine_similarity([query_embed], para_embeds)[0]
-        top_indices = sims.argsort()[-top_k:][::-1]
-        return [clean_paras[i] for i in top_indices]
-    except Exception as e:
-        print(f"Error extracting from {url}: {e}")
-        return []
-
-# Output
 def classify_math_question(question):
     q_embed = model.encode([question])
     scores = cosine_similarity(q_embed, domain_embeddings)[0]
@@ -195,15 +116,7 @@ def classify_math_question(question):
     out += f"<b>Minor Domain:</b> {minor}<br><i>Reason:</i> {minor_reason}<br><br>"

     refs = fetch_arxiv_refs(question, max_results=5)
-    links = fetch_google_cse_links(question, max_results=5)
-    all_links = links
-
-    # Enrich links with top-5 extracted sections
-    enriched_links = []
-    for link in all_links:
-        top_sections = extract_top_sections_from_url(question, link['url'])
-        link['top_sections'] = top_sections
-        enriched_links.append(link)
+    links = fetch_duckduckgo_links(question, max_results=3)

     if refs:
         out += "<b>Top Academic References (arXiv):</b><ul>"
@@ -213,29 +126,22 @@ def classify_math_question(question):
     else:
         out += "<i>No academic references found.</i><br>"

-    if enriched_links:
-        out += "<b>Top Web Resources (Google CSE):</b><ul>"
-        for link in enriched_links:
-            out += f"<li><b>{link['title']}</b><br>{link['snippet']}<br><a href='{link['url']}' target='_blank'>{link['url']}</a>"
-            if link['top_sections']:
-                out += "<br><u>Top Extracted Sections:</u><ol>"
-                for sec in link['top_sections']:
-                    out += f"<li>{sec}</li>"
-                out += "</ol>"
-            out += "</li>"
+    if links:
+        out += "<b>Top Web Resources (DuckDuckGo):</b><ul>"
+        for link in links:
+            out += f"<li><b>{link['title']}</b><br>{link['snippet']}<br><a href='{link['url']}' target='_blank'>{link['url']}</a></li>"
         out += "</ul>"
     else:
         out += "<i>No web links found.</i>"

     return out

-# gradio
 iface = gr.Interface(
     fn=classify_math_question,
     inputs=gr.Textbox(lines=5, label="Enter Math Question (LaTeX supported)"),
     outputs=gr.HTML(label="Predicted Domains + References"),
-    title="⚡ Math Domain Classifier with arXiv + Google CSE",
-    description="Classifies math problems into major/minor domains and fetches fast references from arXiv + Google CSE."
+    title="⚡ Fast Math Domain Classifier with arXiv + DuckDuckGo",
+    description="Classifies math problems into major/minor domains and fetches fast references from arXiv + DuckDuckGo."
 )

 iface.launch()
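For anyone who wants to exercise the new helper outside of Gradio, here is a minimal standalone sketch of the same DuckDuckGo lookup. It assumes the duckduckgo_search package is installed; DDGS.text() returns dicts with 'title', 'href', and 'body' keys (exactly the keys the committed code reads), and the demo query under __main__ is purely illustrative.

# Standalone sketch of the new DuckDuckGo helper (mirrors the committed code, minus Gradio).
from duckduckgo_search import DDGS

def fetch_duckduckgo_links(query, max_results=10):
    links = []
    try:
        with DDGS() as ddgs:
            results = ddgs.text(query, max_results=max_results)
        for res in results:
            url = res["href"]
            # Keep only .edu/.org results, as in the committed version.
            if ".edu" in url or ".org" in url:
                links.append({"title": res["title"], "url": url,
                              "snippet": res["body"], "source": "DuckDuckGo"})
                if len(links) >= 3:  # the commit caps filtered hits at three
                    break
    except Exception:
        pass  # swallow network/rate-limit errors and return what we have
    return links

if __name__ == "__main__":
    for link in fetch_duckduckgo_links("Cauchy residue theorem"):
        print(f"{link['title']} -> {link['url']}")

One caveat: the substring filter also matches any URL that merely contains ".edu" or ".org" anywhere in the string, so parsing the hostname (e.g. with urllib.parse) would make the filter stricter.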
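The classification core itself is untouched by this commit: the question and each domain description are embedded with all-MiniLM-L6-v2 and the domains are ranked by cosine similarity. Below is a minimal sketch of that step, with two domains and abbreviated descriptions for brevity; the helper name pick_domains is illustrative, since app.py formats the result as HTML instead of returning a tuple.

# Sketch of the embedding-based domain routing inside classify_math_question.
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer("all-MiniLM-L6-v2")

DOMAINS = {
    "Real Analysis": "Studies real-valued functions, sequences, limits, continuity, and integration.",
    "Complex Analysis": "Explores analytic functions, contour integration, and conformal mappings.",
}
domain_names = list(DOMAINS.keys())
domain_embeddings = model.encode(list(DOMAINS.values()))

def pick_domains(question):
    # Embed the question once, then rank every domain by cosine similarity.
    q_embed = model.encode([question])
    scores = cosine_similarity(q_embed, domain_embeddings)[0]
    ranked = scores.argsort()[::-1]
    return domain_names[ranked[0]], domain_names[ranked[1]]

major, minor = pick_domains("Evaluate the contour integral of 1/z around the unit circle.")
print("Major:", major, "| Minor:", minor)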