Update app.py
app.py CHANGED
@@ -1,16 +1,18 @@
-# app.py – Now includes DuckDuckGo, arXiv, and Semantic Scholar crawling
-
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 import gradio as gr
 import arxiv
 from semanticscholar import SemanticScholar
-
+import requests
+
+
+GOOGLE_API_KEY = "AIzaSyAhMzIHz2R5VaHC7uSXcZ9yK4luL0yV3sM"
+GOOGLE_CSE_ID = "b2d08ab5820ff465d"
 
 # Load sentence transformer
 model = SentenceTransformer('all-MiniLM-L6-v2')
 
-# Math domain definitions
+# Math domain definitions
 DOMAINS = {
     "Real Analysis": "Studies properties of real-valued functions, sequences, limits, continuity, differentiation, Riemann/Lebesgue integration, and convergence in the real number system.",
     "Complex Analysis": "Explores analytic functions of complex variables, contour integration, conformal mappings, and singularity theory.",

@@ -61,6 +63,7 @@ DOMAINS = {
     "Others / Multidisciplinary": "Covers problems that span multiple mathematical areas or do not fall neatly into a traditional domain."
 }
 
+# Core Functions
 domain_names = list(DOMAINS.keys())
 domain_texts = list(DOMAINS.values())
 domain_embeddings = model.encode(domain_texts)

@@ -81,28 +84,104 @@ def fetch_arxiv_refs(query, max_results=5):
         pass
     return refs
 
-
+
+def fetch_google_cse_links(query, max_results=5):
+    url = "https://www.googleapis.com/customsearch/v1"
     links = []
+    all_snippets = []
+    all_items = []
+    # Social-media and solver/calculator sites to skip (hoisted out of the result loop).
+    excluded_domains = [
+        "facebook.com", "twitter.com", "instagram.com", "linkedin.com", "tiktok.com",
+        "wolframalpha.com", "symbolab.com", "cymath.com", "mathway.com", "mathsolver.microsoft.com",
+        "photomath.com", "mathpapa.com", "integral-calculator.com", "derivative-calculator.net",
+        "mathportal.org", "stattrek.com", "calculatorsoup.com", "desmos.com", "geogebra.org",
+        "socratic.org", "chegg.com", "quizlet.com"
+    ]
     try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # The CSE API serves at most 10 results per request, so page with 'start'.
+        for start in range(1, 100, 10):  # fetch up to 100 results in pages of 10
+            params = {
+                "q": query,
+                "key": GOOGLE_API_KEY,
+                "cx": GOOGLE_CSE_ID,
+                "num": 10,
+                "start": start
+            }
+            res = requests.get(url, params=params)
+            items = res.json().get("items", [])
+            if not items:
+                break
+            for item in items:
+                link_url = item.get("link", "")  # renamed so it does not shadow the endpoint 'url'
+                if any(domain in link_url.lower() for domain in excluded_domains):
+                    continue
+                snippet = item.get("snippet", "")
+                all_items.append({
+                    "title": item.get("title", "No Title"),
+                    "url": link_url,
+                    "snippet": snippet
+                })
+                all_snippets.append(snippet)
+
+        if not all_snippets:
+            return links
+
+        # Greedily group near-duplicate snippets by embedding similarity.
+        snippet_embeddings = model.encode(all_snippets)
+        used = set()
+        for i, emb_i in enumerate(snippet_embeddings):
+            if i in used:
+                continue
+            group = [i]
+            for j in range(i + 1, len(snippet_embeddings)):
+                if j in used:
+                    continue
+                sim = cosine_similarity([emb_i], [snippet_embeddings[j]])[0][0]
+                if sim > 0.8:
+                    group.append(j)
+                    used.add(j)
+            # Prefer a result with PDF in URL if available
+            chosen = None
+            for idx in group:
+                if ".pdf" in all_items[idx]["url"].lower():
+                    chosen = all_items[idx]
                     break
-
-
+            if not chosen:
+                chosen = all_items[group[0]]
+            links.append({
+                "title": chosen["title"],
+                "url": chosen["url"],
+                "snippet": chosen["snippet"],
+                "source": "Google CSE"
+            })
+            used.update(group)
+            if len(links) >= max_results:
+                break
+    except Exception as e:
+        print("Google CSE Error:", e)
     return links
 
+
+# Extract the top-k semantically relevant sections from a web page using all-MiniLM-L6-v2
+from bs4 import BeautifulSoup
+def extract_top_sections_from_url(query, url, top_k=5):
+    try:
+        res = requests.get(url, timeout=6)
+        if res.status_code != 200:
+            return []
+        soup = BeautifulSoup(res.text, 'html.parser')
+        paras = soup.find_all(['p', 'li', 'div'])
+        clean_paras = [p.get_text(strip=True) for p in paras if len(p.get_text(strip=True)) > 80]
+        if not clean_paras:
+            return []
+        query_embed = model.encode([query])          # plain arrays; sklearn does not need tensors
+        para_embeds = model.encode(clean_paras)
+        sims = cosine_similarity(query_embed, para_embeds)[0]
+        top_indices = sims.argsort()[-top_k:][::-1]
+        return [clean_paras[i] for i in top_indices]
+    except Exception as e:
+        print(f"Error extracting from {url}: {e}")
+        return []
+
+# Classify the question and render the HTML output
 def classify_math_question(question):
     q_embed = model.encode([question])
     scores = cosine_similarity(q_embed, domain_embeddings)[0]

@@ -116,7 +195,15 @@ def classify_math_question(question):
     out += f"<b>Minor Domain:</b> {minor}<br><i>Reason:</i> {minor_reason}<br><br>"
 
     refs = fetch_arxiv_refs(question, max_results=5)
-    links =
+    links = fetch_google_cse_links(question, max_results=5)
+    all_links = links
+
+    # Enrich links with top-5 extracted sections
+    enriched_links = []
+    for link in all_links:
+        top_sections = extract_top_sections_from_url(question, link['url'])
+        link['top_sections'] = top_sections
+        enriched_links.append(link)
 
     if refs:
         out += "<b>Top Academic References (arXiv):</b><ul>"

@@ -126,22 +213,29 @@ def classify_math_question(question):
     else:
         out += "<i>No academic references found.</i><br>"
 
-    if
-        out += "<b>Top Web Resources (
-        for link in
-            out += f"<li><b>{link['title']}</b><br>{link['snippet']}<br><a href='{link['url']}' target='_blank'>{link['url']}</a
+    if enriched_links:
+        out += "<b>Top Web Resources (Google CSE):</b><ul>"
+        for link in enriched_links:
+            out += f"<li><b>{link['title']}</b><br>{link['snippet']}<br><a href='{link['url']}' target='_blank'>{link['url']}</a>"
+            if link['top_sections']:
+                out += "<br><u>Top Extracted Sections:</u><ol>"
+                for sec in link['top_sections']:
+                    out += f"<li>{sec}</li>"
+                out += "</ol>"
+            out += "</li>"
         out += "</ul>"
     else:
         out += "<i>No web links found.</i>"
 
     return out
 
+# gradio
 iface = gr.Interface(
     fn=classify_math_question,
     inputs=gr.Textbox(lines=5, label="Enter Math Question (LaTeX supported)"),
     outputs=gr.HTML(label="Predicted Domains + References"),
-    title="⚡
-    description="Classifies math problems into major/minor domains and fetches fast references from arXiv
+    title="⚡ Math Domain Classifier with arXiv + Google",
+    description="Classifies math problems into major/minor domains and fetches fast references from arXiv and Google."
 )
 
 iface.launch()
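
A note on the classification step: the app ranks DOMAINS by cosine similarity between the question embedding and the precomputed domain-description embeddings; the major/minor selection lines are elided from this diff, so the top-2 choice below is an assumption. A minimal sketch of that ranking with just two of the domains:

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer('all-MiniLM-L6-v2')
domains = {
    "Real Analysis": "Studies real-valued functions, sequences, limits, continuity, and convergence.",
    "Complex Analysis": "Explores analytic functions of complex variables and contour integration.",
}
names = list(domains)
domain_embeddings = model.encode(list(domains.values()))

question = "Show that the sequence x_{n+1} = sqrt(2 + x_n) converges."
scores = cosine_similarity(model.encode([question]), domain_embeddings)[0]
order = np.argsort(scores)[::-1]  # domain indices, best match first
print("Major:", names[order[0]], "| Minor:", names[order[1]])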
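
On the paging loop in fetch_google_cse_links: the Custom Search JSON API returns at most 10 results per request, and further pages are requested with the 1-based start parameter (1, 11, 21, ...), which is why the loop steps by 10. A standalone sketch of one paged query; the key and engine ID here are placeholders, not the credentials in app.py, and API quota limits apply:

import requests

API_KEY = "YOUR_API_KEY"  # placeholder, not a real credential
CSE_ID = "YOUR_CSE_ID"    # placeholder engine ID

def cse_page(query, start):
    # Fetch one page of up to 10 results; 'start' is 1-based.
    res = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params={"q": query, "key": API_KEY, "cx": CSE_ID, "num": 10, "start": start},
        timeout=10,
    )
    res.raise_for_status()
    return res.json().get("items", [])

items = []
for start in range(1, 31, 10):  # first three pages
    page = cse_page("monotone convergence theorem", start)
    if not page:
        break
    items.extend(page)
print(len(items), "results fetched")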
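
The snippet deduplication is a greedy single pass: each unused snippet seeds a group, later snippets join when the cosine similarity of their embeddings exceeds 0.8, and one representative per group survives, with PDF links preferred. A self-contained sketch of the same grouping on made-up results (the results list is illustrative, not real CSE output):

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer('all-MiniLM-L6-v2')

results = [
    {"title": "Rolle's theorem notes", "url": "https://example.edu/rolle.pdf",
     "snippet": "Rolle's theorem: a differentiable function with equal endpoint values has a stationary point."},
    {"title": "Rolle's theorem (mirror)", "url": "https://example.org/rolle.html",
     "snippet": "Rolle's theorem states that a differentiable function taking equal values at the endpoints has a critical point."},
    {"title": "Fourier series intro", "url": "https://example.com/fourier.html",
     "snippet": "A Fourier series decomposes a periodic function into sines and cosines."},
]

embeddings = model.encode([r["snippet"] for r in results])
used, picked = set(), []
for i in range(len(results)):
    if i in used:
        continue
    # Group i with every later, unused snippet whose similarity exceeds 0.8.
    group = [i] + [j for j in range(i + 1, len(results))
                   if j not in used
                   and cosine_similarity([embeddings[i]], [embeddings[j]])[0][0] > 0.8]
    used.update(group)
    # Keep one representative per group, preferring a PDF link as app.py does.
    chosen = next((results[k] for k in group if ".pdf" in results[k]["url"].lower()),
                  results[group[0]])
    picked.append(chosen)

for r in picked:
    print(r["title"], "->", r["url"])
# Expected: the two Rolle snippets collapse to the PDF; the Fourier result stays.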