File size: 13,755 Bytes
c02e09b
 
 
125237a
 
051c6a0
 
 
 
 
c02e09b
125237a
 
c02e09b
051c6a0
c02e09b
 
f3afefa
c02e09b
f3afefa
 
 
 
 
125237a
f3afefa
 
 
 
 
 
 
 
 
125237a
f3afefa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c02e09b
 
051c6a0
c02e09b
 
 
 
b8db557
026dc36
125237a
026dc36
125237a
026dc36
125237a
 
 
 
 
 
 
 
026dc36
125237a
051c6a0
 
 
125237a
051c6a0
 
125237a
051c6a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b221737
051c6a0
 
 
 
 
 
 
 
 
 
 
 
 
125237a
 
051c6a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c02e09b
 
 
 
 
 
 
 
125237a
 
 
 
de77ff5
051c6a0
 
 
 
 
 
 
 
 
125237a
 
026dc36
125237a
026dc36
125237a
 
 
 
051c6a0
 
 
 
 
 
 
 
 
 
125237a
 
 
 
 
c02e09b
051c6a0
c02e09b
 
125237a
 
051c6a0
 
c02e09b
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
import html
import os

import arxiv
import gradio as gr
import requests
from semanticscholar import SemanticScholar
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


# Google Custom Search credentials. An environment variable of the same name
# takes precedence, so a deployment can inject its own values without editing
# this file; an unset or empty variable falls back to the literal.
# SECURITY: the fallback literals are credentials committed to source control.
# Treat the API key as exposed: rotate it, and drop the fallbacks once the
# environment variables are provisioned.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or "AIzaSyAhMzIHz2R5VaHC7uSXcZ9yK4luL0yV3sM"
GOOGLE_CSE_ID = os.environ.get("GOOGLE_CSE_ID") or "b2d08ab5820ff465d"

# Shared sentence-embedding model, loaded once when the module is imported and
# reused by every function below that calls model.encode().
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

# Math domain definitions
# Maps each domain label to a one-sentence description. The descriptions are
# embedded once at import time and compared against the embedded question to
# rank domains; the same description text is shown to the user as the "Reason"
# for the chosen major/minor domain. Editing a description therefore changes
# both the classification and the displayed explanation.
DOMAINS = {
    "Real Analysis": "Studies properties of real-valued functions, sequences, limits, continuity, differentiation, Riemann/ Lebesgue integration, and convergence in the real number system.",
    "Complex Analysis": "Explores analytic functions of complex variables, contour integration, conformal mappings, and singularity theory.",
    "Functional Analysis": "Deals with infinite-dimensional vector spaces, Banach and Hilbert spaces, linear operators, duality, and spectral theory in the context of functional spaces.",
    "Measure Theory": "Studies sigma-algebras, measures, measurable functions, and integrals, forming the foundation for modern probability and real analysis.",
    "Fourier and Harmonic Analysis": "Analyzes functions via decompositions into sines, cosines, or general orthogonal bases, often involving Fourier series, Fourier transforms, and convolution techniques.",
    "Calculus of Variations": "Optimizes functionals over infinite-dimensional spaces, leading to Euler-Lagrange equations and applications in physics and control theory.",
    "Metric Geometry": "Explores geometric properties of metric spaces and the behavior of functions and sequences under various notions of distance.",
    "Ordinary Differential Equations (ODEs)": "Involves differential equations with functions of a single variable, their qualitative behavior, existence, uniqueness, and methods of solving them.",
    "Partial Differential Equations (PDEs)": "Deals with multivariable functions involving partial derivatives, including wave, heat, and Laplace equations.",
    "Dynamical Systems": "Studies evolution of systems over time using discrete or continuous-time equations, stability theory, phase portraits, and attractors.",
    "Linear Algebra": "Focuses on vector spaces, linear transformations, eigenvalues, diagonalization, and matrices.",
    "Abstract Algebra": "General study of algebraic structures such as groups, rings, fields, and modules.",
    "Group Theory": "Investigates algebraic structures with a single binary operation satisfying group axioms, including symmetry groups and applications.",
    "Ring and Module Theory": "Extends group theory to rings (two operations) and modules (generalized vector spaces).",
    "Field Theory": "Studies field extensions, algebraic and transcendental elements, and classical constructions.",
    "Galois Theory": "Connects field theory and group theory to solve polynomial equations and understand solvability.",
    "Algebraic Number Theory": "Applies tools from abstract algebra to study integers, Diophantine equations, and number fields.",
    "Representation Theory": "Studies abstract algebraic structures by representing their elements as linear transformations of vector spaces.",
    "Algebraic Geometry": "Examines solutions to polynomial equations using geometric and algebraic techniques like varieties, schemes, and morphisms.",
    "Differential Geometry": "Studies geometric structures on smooth manifolds, curvature, geodesics, and applications in general relativity.",
    "Topology": "Analyzes qualitative spatial properties preserved under continuous deformations, including homeomorphism, compactness, and connectedness.",
    "Geometric Topology": "Explores topological manifolds and their classification, knot theory, and low-dimensional topology.",
    "Symplectic Geometry": "Studies geometry arising from Hamiltonian systems and phase space, central to classical mechanics.",
    "Combinatorics": "Covers enumeration, existence, construction, and optimization of discrete structures.",
    "Graph Theory": "Deals with the study of graphs, networks, trees, connectivity, and coloring problems.",
    "Discrete Geometry": "Focuses on geometric objects and combinatorial properties in finite settings, such as polytopes and tilings.",
    "Set Theory": "Studies sets, cardinality, ordinals, ZFC axioms, and independence results.",
    "Mathematical Logic": "Includes propositional logic, predicate logic, proof theory, model theory, and recursion theory.",
    "Category Theory": "Provides a high-level, structural framework to relate different mathematical systems through morphisms and objects.",
    "Probability Theory": "Mathematical foundation for randomness, including random variables, distributions, expectation, and stochastic processes.",
    "Mathematical Statistics": "Theory behind estimation, hypothesis testing, confidence intervals, and likelihood inference.",
    "Stochastic Processes": "Studies processes that evolve with randomness over time, like Markov chains and Brownian motion.",
    "Information Theory": "Analyzes data transmission, entropy, coding theory, and information content in probabilistic settings.",
    "Numerical Analysis": "Designs and analyzes algorithms to approximate solutions of mathematical problems including root-finding, integration, and differential equations.",
    "Optimization": "Studies finding best outcomes under constraints, including convex optimization, linear programming, and integer programming.",
    "Operations Research": "Applies optimization, simulation, and probabilistic modeling to decision-making problems in logistics, finance, and industry.",
    "Control Theory": "Mathematically models and regulates dynamic systems through feedback and optimal control strategies.",
    "Computational Mathematics": "Applies algorithmic and numerical techniques to solve mathematical problems on computers.",
    "Game Theory": "Analyzes strategic interaction among rational agents using payoff matrices and equilibrium concepts.",
    "Machine Learning Theory": "Explores the mathematical foundation of algorithms that learn from data, covering generalization, VC dimension, and convergence.",
    "Spectral Theory": "Studies the spectrum (eigenvalues) of linear operators, primarily in Hilbert/Banach spaces, relevant to quantum mechanics and PDEs.",
    "Operator Theory": "Focuses on properties of linear operators on function spaces and their classification.",
    "Mathematical Physics": "Uses advanced mathematical tools to solve and model problems in physics, often involving differential geometry and functional analysis.",
    "Financial Mathematics": "Applies stochastic calculus and optimization to problems in pricing, risk, and investment.",
    "Mathematics Education": "Focuses on teaching methods, learning theories, and curriculum design in mathematics.",
    "History of Mathematics": "Studies the historical development of mathematical concepts, theorems, and personalities.",
    "Others / Multidisciplinary": "Covers problems that span multiple mathematical areas or do not fall neatly into a traditional domain."
}

# Core Functions
# Index-aligned views of DOMAINS: domain_names[i] labels the description in
# domain_texts[i], whose embedding is row i of domain_embeddings.
domain_names = [label for label in DOMAINS]
domain_texts = [DOMAINS[label] for label in domain_names]
domain_embeddings = model.encode(domain_texts)

def fetch_arxiv_refs(query, max_results=5):
    """Search arXiv for ``query`` and return up to ``max_results`` references.

    Args:
        query: Free-text search string. A blank query returns ``[]`` without
            contacting arXiv.
        max_results: Upper bound passed to ``arxiv.Search``.

    Returns:
        A list of dicts with the keys ``title``, ``authors`` (names of the
        first three authors, comma-separated), ``year``, ``url`` and
        ``source`` (always ``"arXiv"``).

    The lookup is best-effort: a failure is reported on stdout and whatever
    was collected before it (possibly nothing) is returned instead of raising.
    """
    if not query or not str(query).strip():
        return []

    refs = []
    try:
        search = arxiv.Search(query=query, max_results=max_results)
        # NOTE(review): newer releases of the arxiv package appear to deprecate
        # Search.results() in favour of Client.results(search) - confirm
        # against the pinned version before upgrading.
        for r in search.results():
            refs.append({
                "title": r.title,
                "authors": ", ".join(a.name for a in r.authors[:3]),
                "year": r.published.year,
                "url": r.entry_id,
                "source": "arXiv"
            })
    except Exception as e:
        # Catch Exception rather than everything so Ctrl-C / SystemExit still
        # propagate, and log so an outage is not silently indistinguishable
        # from "no results".
        print("arXiv Error:", e)
    return refs


# Custom Search JSON API endpoint and paging limits. Each request asks for 10
# results; the page count is capped at 10 because the API does not page past
# the first 100 results.
_CSE_ENDPOINT = "https://www.googleapis.com/customsearch/v1"
_CSE_PAGE_SIZE = 10
_CSE_MAX_PAGES = 10
_CSE_TIMEOUT_SECONDS = 10

# A result is dropped when its URL contains any of these substrings.
_CSE_EXCLUDED_DOMAINS = (
    "facebook.com", "twitter.com", "instagram.com", "linkedin.com", "tiktok.com",
    "wolframalpha.com", "symbolab.com", "cymath.com", "mathway.com", "mathsolver.microsoft.com",
    "photomath.com", "mathpapa.com", "integral-calculator.com", "derivative-calculator.net",
    "mathportal.org", "stattrek.com", "calculatorsoup.com", "desmos.com", "geogebra.org",
    "socratic.org", "chegg.com", "quizlet.com",
)

# Snippets whose cosine similarity exceeds this are treated as near-duplicates.
_CSE_DUPLICATE_THRESHOLD = 0.8


def _collect_cse_candidates(query, max_pages):
    """Fetch result pages from Google CSE and return the non-excluded items."""
    pages = max(1, min(int(max_pages), _CSE_MAX_PAGES))
    candidates = []
    for start in range(1, 1 + _CSE_PAGE_SIZE * pages, _CSE_PAGE_SIZE):
        params = {
            "q": query,
            "key": GOOGLE_API_KEY,
            "cx": GOOGLE_CSE_ID,
            "num": _CSE_PAGE_SIZE,
            "start": start
        }
        try:
            res = requests.get(_CSE_ENDPOINT, params=params, timeout=_CSE_TIMEOUT_SECONDS)
            # Surface quota/auth errors instead of reading them as "no items".
            res.raise_for_status()
            items = res.json().get("items", [])
        except Exception as e:
            # A failed page ends paging but keeps what earlier pages returned.
            print("Google CSE Error:", e)
            break
        if not items:
            break
        for item in items:
            link = item.get("link", "")
            lowered = link.lower()
            if any(domain in lowered for domain in _CSE_EXCLUDED_DOMAINS):
                continue
            candidates.append({
                "title": item.get("title", "No Title"),
                "url": link,
                "snippet": item.get("snippet", "")
            })
    return candidates


def fetch_google_cse_links(query, max_results=5, max_pages=1):
    """Return up to ``max_results`` de-duplicated web links for ``query``.

    Results come from the Google Custom Search JSON API. Items from excluded
    sites are dropped, the remaining snippets are embedded with the shared
    sentence-transformer model, and results whose snippets are near-duplicates
    (cosine similarity above 0.8 to the first member of a group) are collapsed
    into one entry, preferring a member whose URL contains ``.pdf``.

    Args:
        query: Free-text search string. A blank query returns ``[]``.
        max_results: Maximum number of links to return; values <= 0 return ``[]``.
        max_pages: Number of 10-result pages to request, clamped to 1-10. The
            default of 1 is the single request this function has always made;
            each extra page is one more API call.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``snippet`` and
        ``source`` (always ``"Google CSE"``).

    The lookup is best-effort: failures are printed and the links gathered so
    far (possibly none) are returned instead of raising.
    """
    if max_results <= 0 or not query or not str(query).strip():
        return []

    links = []
    try:
        candidates = _collect_cse_candidates(query, max_pages)
        if not candidates:
            return links

        embeddings = model.encode([c["snippet"] for c in candidates])
        # One pairwise matrix instead of a separate call for every (i, j) pair.
        similarity = cosine_similarity(embeddings)

        used = set()
        for i, anchor in enumerate(candidates):
            if i in used:
                continue
            group = [i] + [
                j for j in range(i + 1, len(candidates))
                if j not in used and similarity[i][j] > _CSE_DUPLICATE_THRESHOLD
            ]
            used.update(group)
            # Prefer a result with PDF in URL if available
            chosen = next(
                (candidates[k] for k in group if ".pdf" in candidates[k]["url"].lower()),
                anchor,
            )
            links.append({**chosen, "source": "Google CSE"})
            if len(links) >= max_results:
                break
    except Exception as e:
        print("Google CSE Error:", e)
    return links


# Extract top-5 semantically relevant sections from a web page using all-MiniLM-L6-v2
from bs4 import BeautifulSoup
def extract_top_sections_from_url(query, url, top_k=5):
    """Return the ``top_k`` text sections of a web page most similar to ``query``.

    The page is downloaded, the text of every ``<p>``, ``<li>`` and ``<div>``
    longer than 80 characters is collected, and the sections are ranked by
    cosine similarity between their embedding and the query embedding.

    Args:
        query: Text the sections are compared against.
        url: Page to download (6 second timeout).
        top_k: Maximum number of sections to return; values <= 0 return ``[]``.

    Returns:
        Section texts ordered from most to least similar. An empty list is
        returned for a non-200 response, a page with no qualifying text, or
        any error (which is printed rather than raised).
    """
    if top_k <= 0:
        # Without this guard the slice below would select every section.
        return []
    try:
        res = requests.get(url, timeout=6)
        if res.status_code != 200:
            return []
        soup = BeautifulSoup(res.text, 'html.parser')
        texts = (el.get_text(strip=True) for el in soup.find_all(['p', 'li', 'div']))
        # A wrapper <div> and the <p> inside it can yield the same text;
        # dict.fromkeys drops exact repeats while keeping document order.
        clean_paras = list(dict.fromkeys(t for t in texts if len(t) > 80))
        if not clean_paras:
            return []
        # Keep the embeddings as NumPy arrays (the encode default): that is the
        # array-like input scikit-learn's cosine_similarity works with, on any
        # device the model runs on.
        query_embed = model.encode([query])
        para_embeds = model.encode(clean_paras)
        sims = cosine_similarity(query_embed, para_embeds)[0]
        top_indices = sims.argsort()[::-1][:top_k]
        return [clean_paras[i] for i in top_indices]
    except Exception as e:
        print(f"Error extracting from {url}: {e}")
        return []

# Output
def classify_math_question(question):
    """Classify a math question and build an HTML report with references.

    The question is embedded and compared against the domain description
    embeddings; the two highest-scoring domains are reported as the major and
    minor domain, each with its description as the reason. The report then
    lists arXiv references and Google CSE web links, each link followed by the
    page sections most similar to the question.

    Args:
        question: The question text. A blank value returns a short prompt
            without running the model or any network lookup.

    Returns:
        An HTML string. Every value that originates from arXiv or the web
        (titles, authors, snippets, URLs, extracted page text) is HTML-escaped
        so remote content cannot inject markup into the page.
    """
    if not question or not str(question).strip():
        return "<i>Please enter a math question.</i>"

    q_embed = model.encode([question])
    scores = cosine_similarity(q_embed, domain_embeddings)[0]
    sorted_indices = scores.argsort()[::-1]
    major = domain_names[sorted_indices[0]]
    minor = domain_names[sorted_indices[1]]

    refs = fetch_arxiv_refs(question, max_results=5)
    links = fetch_google_cse_links(question, max_results=5)

    # Enrich links with top-5 extracted sections
    for link in links:
        link['top_sections'] = extract_top_sections_from_url(question, link['url'])

    # html.escape also escapes quotes by default, so the escaped URL is safe
    # inside the single-quoted href attribute as well as in text.
    esc = html.escape

    parts = [
        f"<b>Major Domain:</b> {major}<br><i>Reason:</i> {DOMAINS[major]}<br><br>",
        f"<b>Minor Domain:</b> {minor}<br><i>Reason:</i> {DOMAINS[minor]}<br><br>",
    ]

    if refs:
        parts.append("<b>Top Academic References (arXiv):</b><ul>")
        for p in refs:
            ref_url = esc(str(p['url']))
            parts.append(
                f"<li><b>{esc(str(p['title']))}</b> ({esc(str(p['year']))}) - "
                f"<i>{esc(str(p['authors']))}</i><br>"
                f"<a href='{ref_url}' target='_blank'>{ref_url}</a></li>"
            )
        parts.append("</ul>")
    else:
        parts.append("<i>No academic references found.</i><br>")

    if links:
        parts.append("<b>Top Web Resources (Google CSE):</b><ul>")
        for link in links:
            link_url = esc(str(link['url']))
            parts.append(
                f"<li><b>{esc(str(link['title']))}</b><br>{esc(str(link['snippet']))}<br>"
                f"<a href='{link_url}' target='_blank'>{link_url}</a>"
            )
            if link['top_sections']:
                parts.append("<br><u>Top Extracted Sections:</u><ol>")
                parts.extend(f"<li>{esc(str(sec))}</li>" for sec in link['top_sections'])
                parts.append("</ol>")
            parts.append("</li>")
        parts.append("</ul>")
    else:
        parts.append("<i>No web links found.</i>")

    return "".join(parts)

# gradio
# UI wiring: a multi-line text box feeds classify_math_question and the HTML
# string it returns is rendered in an HTML panel. The app is launched as soon
# as the module runs.
question_box = gr.Textbox(lines=5, label="Enter Math Question (LaTeX supported)")
result_panel = gr.HTML(label="Predicted Domains + References")

iface = gr.Interface(
    fn=classify_math_question,
    inputs=question_box,
    outputs=result_panel,
    title="⚡ Math Domain Classifier with arXiv + Google",
    description="Classifies math problems into major/minor domains and fetches fast references from arXiv and Google.",
)

iface.launch()