import html
import os

import arxiv
import gradio as gr
import requests
from semanticscholar import SemanticScholar
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


# Google Custom Search credentials.  The GOOGLE_API_KEY / GOOGLE_CSE_ID
# environment variables take precedence, so a deployment can supply its own
# values without editing this file.
# SECURITY NOTE(review): the literal fallbacks below are committed to the
# source and must be treated as leaked -- rotate the key, then delete the
# fallbacks so the values come from the environment only.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or "AIzaSyAhMzIHz2R5VaHC7uSXcZ9yK4luL0yV3sM"
GOOGLE_CSE_ID = os.environ.get("GOOGLE_CSE_ID") or "b2d08ab5820ff465d"

# Load sentence transformer
# One shared 'all-MiniLM-L6-v2' encoder is created when the module is loaded;
# every embedding in this file (domain descriptions, the user's question,
# search snippets and page paragraphs) is produced by this instance.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Math domain definitions
# Maps each domain's display name to a one-sentence description.  The
# descriptions are the text that gets embedded and compared against the user's
# question, and the matching description is also shown to the user as the
# "Reason" -- so editing the wording changes both the classification and the
# displayed output.
DOMAINS = {
    "Real Analysis": "Studies properties of real-valued functions, sequences, limits, continuity, differentiation, Riemann/ Lebesgue integration, and convergence in the real number system.",
    "Complex Analysis": "Explores analytic functions of complex variables, contour integration, conformal mappings, and singularity theory.",
    "Functional Analysis": "Deals with infinite-dimensional vector spaces, Banach and Hilbert spaces, linear operators, duality, and spectral theory in the context of functional spaces.",
    "Measure Theory": "Studies sigma-algebras, measures, measurable functions, and integrals, forming the foundation for modern probability and real analysis.",
    "Fourier and Harmonic Analysis": "Analyzes functions via decompositions into sines, cosines, or general orthogonal bases, often involving Fourier series, Fourier transforms, and convolution techniques.",
    "Calculus of Variations": "Optimizes functionals over infinite-dimensional spaces, leading to Euler-Lagrange equations and applications in physics and control theory.",
    "Metric Geometry": "Explores geometric properties of metric spaces and the behavior of functions and sequences under various notions of distance.",
    "Ordinary Differential Equations (ODEs)": "Involves differential equations with functions of a single variable, their qualitative behavior, existence, uniqueness, and methods of solving them.",
    "Partial Differential Equations (PDEs)": "Deals with multivariable functions involving partial derivatives, including wave, heat, and Laplace equations.",
    "Dynamical Systems": "Studies evolution of systems over time using discrete or continuous-time equations, stability theory, phase portraits, and attractors.",
    "Linear Algebra": "Focuses on vector spaces, linear transformations, eigenvalues, diagonalization, and matrices.",
    "Abstract Algebra": "General study of algebraic structures such as groups, rings, fields, and modules.",
    "Group Theory": "Investigates algebraic structures with a single binary operation satisfying group axioms, including symmetry groups and applications.",
    "Ring and Module Theory": "Extends group theory to rings (two operations) and modules (generalized vector spaces).",
    "Field Theory": "Studies field extensions, algebraic and transcendental elements, and classical constructions.",
    "Galois Theory": "Connects field theory and group theory to solve polynomial equations and understand solvability.",
    "Algebraic Number Theory": "Applies tools from abstract algebra to study integers, Diophantine equations, and number fields.",
    "Representation Theory": "Studies abstract algebraic structures by representing their elements as linear transformations of vector spaces.",
    "Algebraic Geometry": "Examines solutions to polynomial equations using geometric and algebraic techniques like varieties, schemes, and morphisms.",
    "Differential Geometry": "Studies geometric structures on smooth manifolds, curvature, geodesics, and applications in general relativity.",
    "Topology": "Analyzes qualitative spatial properties preserved under continuous deformations, including homeomorphism, compactness, and connectedness.",
    "Geometric Topology": "Explores topological manifolds and their classification, knot theory, and low-dimensional topology.",
    "Symplectic Geometry": "Studies geometry arising from Hamiltonian systems and phase space, central to classical mechanics.",
    "Combinatorics": "Covers enumeration, existence, construction, and optimization of discrete structures.",
    "Graph Theory": "Deals with the study of graphs, networks, trees, connectivity, and coloring problems.",
    "Discrete Geometry": "Focuses on geometric objects and combinatorial properties in finite settings, such as polytopes and tilings.",
    "Set Theory": "Studies sets, cardinality, ordinals, ZFC axioms, and independence results.",
    "Mathematical Logic": "Includes propositional logic, predicate logic, proof theory, model theory, and recursion theory.",
    "Category Theory": "Provides a high-level, structural framework to relate different mathematical systems through morphisms and objects.",
    "Probability Theory": "Mathematical foundation for randomness, including random variables, distributions, expectation, and stochastic processes.",
    "Mathematical Statistics": "Theory behind estimation, hypothesis testing, confidence intervals, and likelihood inference.",
    "Stochastic Processes": "Studies processes that evolve with randomness over time, like Markov chains and Brownian motion.",
    "Information Theory": "Analyzes data transmission, entropy, coding theory, and information content in probabilistic settings.",
    "Numerical Analysis": "Designs and analyzes algorithms to approximate solutions of mathematical problems including root-finding, integration, and differential equations.",
    "Optimization": "Studies finding best outcomes under constraints, including convex optimization, linear programming, and integer programming.",
    "Operations Research": "Applies optimization, simulation, and probabilistic modeling to decision-making problems in logistics, finance, and industry.",
    "Control Theory": "Mathematically models and regulates dynamic systems through feedback and optimal control strategies.",
    "Computational Mathematics": "Applies algorithmic and numerical techniques to solve mathematical problems on computers.",
    "Game Theory": "Analyzes strategic interaction among rational agents using payoff matrices and equilibrium concepts.",
    "Machine Learning Theory": "Explores the mathematical foundation of algorithms that learn from data, covering generalization, VC dimension, and convergence.",
    "Spectral Theory": "Studies the spectrum (eigenvalues) of linear operators, primarily in Hilbert/Banach spaces, relevant to quantum mechanics and PDEs.",
    "Operator Theory": "Focuses on properties of linear operators on function spaces and their classification.",
    "Mathematical Physics": "Uses advanced mathematical tools to solve and model problems in physics, often involving differential geometry and functional analysis.",
    "Financial Mathematics": "Applies stochastic calculus and optimization to problems in pricing, risk, and investment.",
    "Mathematics Education": "Focuses on teaching methods, learning theories, and curriculum design in mathematics.",
    "History of Mathematics": "Studies the historical development of mathematical concepts, theorems, and personalities.",
    "Others / Multidisciplinary": "Covers problems that span multiple mathematical areas or do not fall neatly into a traditional domain."
}

# Parallel lookup tables built once at load time: index i in domain_names,
# domain_texts and domain_embeddings always refers to the same domain.
domain_names = [*DOMAINS]
domain_texts = [*DOMAINS.values()]
domain_embeddings = model.encode(domain_texts)

def fetch_arxiv_refs(query, max_results=5):
    """Return arXiv papers matching ``query`` as a list of plain dicts.

    This is a best-effort lookup: any failure while talking to arXiv is
    reported on stdout and whatever was collected so far is returned, so the
    caller never has to handle an exception.

    Args:
        query: Free-text search string.  Blank queries return ``[]`` without
            contacting arXiv.
        max_results: Maximum number of papers requested.  Values <= 0 return
            ``[]``.

    Returns:
        A list of dicts with the keys ``title``, ``authors`` (at most the
        first three author names, comma separated), ``year``, ``url`` and
        ``source`` (always ``"arXiv"``).
    """
    refs = []
    if not query or not query.strip():
        return refs
    if max_results is not None and max_results <= 0:
        return refs
    try:
        search = arxiv.Search(query=query, max_results=max_results)
        # NOTE(review): newer releases of the arxiv package appear to
        # deprecate Search.results() in favour of Client().results(search) --
        # confirm against the pinned version before switching.
        for r in search.results():
            refs.append({
                "title": r.title,
                "authors": ", ".join(a.name for a in r.authors[:3]),
                "year": r.published.year,
                "url": r.entry_id,
                "source": "arXiv"
            })
    except Exception as e:
        # Same reporting style as the other fetch helpers in this file; a bare
        # ``except`` would also have swallowed KeyboardInterrupt/SystemExit.
        print("arXiv Error:", e)
    return refs


def fetch_google_cse_links(query, max_results=5, max_pages=1):
    """Search Google Custom Search and return de-duplicated result links.

    Results hosted on social-media or calculator/solver sites are dropped.
    The remaining results are grouped by snippet similarity (cosine similarity
    above 0.8 on the shared sentence-transformer embeddings); one result is
    kept per group, preferring a URL containing ``.pdf``.

    Any failure (network, HTTP status, JSON, embedding) is reported on stdout
    and the links gathered so far are returned.

    Args:
        query: Free-text search string.  Blank queries return ``[]`` without
            issuing a request.
        max_results: Maximum number of links returned.  Values <= 0 return
            ``[]``.
        max_pages: Number of 10-result pages to request.  The default of 1
            matches the single request the function has always issued; the
            value is capped at 10 pages.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``snippet`` and
        ``source`` (always ``"Google CSE"``).
    """
    endpoint = "https://www.googleapis.com/customsearch/v1"
    page_size = 10
    # Built once per call instead of once per search result.
    excluded_domains = (
        "facebook.com", "twitter.com", "instagram.com", "linkedin.com", "tiktok.com",
        "wolframalpha.com", "symbolab.com", "cymath.com", "mathway.com", "mathsolver.microsoft.com",
        "photomath.com", "mathpapa.com", "integral-calculator.com", "derivative-calculator.net",
        "mathportal.org", "stattrek.com", "calculatorsoup.com", "desmos.com", "geogebra.org",
        "socratic.org", "chegg.com", "quizlet.com"
    )

    links = []
    if not query or not query.strip() or max_results <= 0:
        return links

    candidates = []
    try:
        for page in range(min(max_pages, 10)):
            params = {
                "q": query,
                "key": GOOGLE_API_KEY,
                "cx": GOOGLE_CSE_ID,
                "num": page_size,
                "start": 1 + page * page_size
            }
            # A timeout keeps a stalled connection from hanging the request,
            # and raise_for_status surfaces quota/auth errors in the log
            # instead of letting them look like "no results".
            res = requests.get(endpoint, params=params, timeout=10)
            res.raise_for_status()
            items = res.json().get("items", [])
            if not items:
                break
            for item in items:
                # Kept in its own name: reusing ``url`` here would overwrite
                # the API endpoint used by the next page request.
                link = item.get("link", "")
                lowered = link.lower()
                if any(domain in lowered for domain in excluded_domains):
                    continue
                candidates.append({
                    "title": item.get("title", "No Title"),
                    "url": link,
                    "snippet": item.get("snippet", "")
                })

        if not candidates:
            return links

        # Fall back to the title when a result has no snippet; otherwise all
        # snippet-less results would embed identically and be merged into one.
        texts = [c["snippet"] or c["title"] for c in candidates]
        # One pairwise similarity matrix instead of one call per pair.
        similarity = cosine_similarity(model.encode(texts))

        used = set()
        for i, first in enumerate(candidates):
            if i in used:
                continue
            group = [i]
            for j in range(i + 1, len(candidates)):
                if j not in used and similarity[i][j] > 0.8:
                    group.append(j)
                    used.add(j)
            # Prefer a result with PDF in URL if available
            chosen = next(
                (candidates[k] for k in group if ".pdf" in candidates[k]["url"].lower()),
                first,
            )
            links.append({
                "title": chosen["title"],
                "url": chosen["url"],
                "snippet": chosen["snippet"],
                "source": "Google CSE"
            })
            if len(links) >= max_results:
                break
    except Exception as e:
        print("Google CSE Error:", e)
    return links


# Extract top-5 semantically relevant sections from a web page using all-MiniLM-L6-v2
from bs4 import BeautifulSoup
def extract_top_sections_from_url(query, url, top_k=5):
    """Return the ``top_k`` text blocks of a web page most similar to ``query``.

    The page is downloaded, the text of its ``<p>``, ``<li>`` and ``<div>``
    elements longer than 80 characters is collected, and the blocks are ranked
    by cosine similarity between their embedding and the query embedding.

    Any failure is reported on stdout and ``[]`` is returned.

    Args:
        query: Text the page content is ranked against.
        url: Address of the page to download.  An empty URL returns ``[]``.
        top_k: Number of blocks to return.  Values <= 0 return ``[]``.

    Returns:
        Up to ``top_k`` strings, most similar first; ``[]`` when the page
        cannot be fetched, is not text/HTML, or has no long-enough blocks.
    """
    # ``[-0:]`` would select *every* block, so handle top_k <= 0 up front.
    if not url or top_k <= 0:
        return []
    try:
        res = requests.get(url, timeout=6)
        if res.status_code != 200:
            return []
        # Search results may point at PDFs or other binary payloads; feeding
        # those to the HTML parser only produces garbage "sections".
        content_type = res.headers.get("Content-Type", "").lower()
        if content_type and "html" not in content_type and not content_type.startswith("text/"):
            return []
        soup = BeautifulSoup(res.text, 'html.parser')
        # Nested containers (a <div> wrapping a single <div>/<p>) yield the
        # same text several times; keep only the first occurrence.
        seen = set()
        clean_paras = []
        for node in soup.find_all(['p', 'li', 'div']):
            text = node.get_text(strip=True)
            if len(text) > 80 and text not in seen:
                seen.add(text)
                clean_paras.append(text)
        if not clean_paras:
            return []
        # Use the encoder's default array output, as the rest of this file
        # does: scikit-learn's cosine_similarity works on array-likes, whereas
        # torch tensors (possibly on a GPU) are not a supported input.
        query_embed = model.encode([query])
        para_embeds = model.encode(clean_paras)
        sims = cosine_similarity(query_embed, para_embeds)[0]
        top_indices = sims.argsort()[::-1][:top_k]
        return [clean_paras[i] for i in top_indices]
    except Exception as e:
        print(f"Error extracting from {url}: {e}")
        return []

# Main entry point: classify the question and render the HTML report
def classify_math_question(question):
    """Classify a math question and render an HTML report with references.

    The question is embedded and compared with every domain description; the
    two best-scoring domains are reported as major and minor domain together
    with their descriptions.  arXiv papers and Google CSE links for the
    question are then appended, each web link followed by the page sections
    most similar to the question.

    Args:
        question: The user's question text.  Blank input returns a short
            prompt instead of running the classifier and the searches.

    Returns:
        An HTML fragment (str).
    """
    if not question or not question.strip():
        return "<i>Please enter a math question.</i>"

    q_embed = model.encode([question])
    scores = cosine_similarity(q_embed, domain_embeddings)[0]
    sorted_indices = scores.argsort()[::-1]
    major = domain_names[sorted_indices[0]]
    minor = domain_names[sorted_indices[1]]

    # Fragments are collected in a list and joined once at the end.
    parts = [
        f"<b>Major Domain:</b> {major}<br><i>Reason:</i> {DOMAINS[major]}<br><br>",
        f"<b>Minor Domain:</b> {minor}<br><i>Reason:</i> {DOMAINS[minor]}<br><br>",
    ]

    refs = fetch_arxiv_refs(question, max_results=5)
    links = fetch_google_cse_links(question, max_results=5)

    # Enrich links with top-5 extracted sections
    for link in links:
        link['top_sections'] = extract_top_sections_from_url(question, link['url'])

    # Titles, author names, snippets, URLs and scraped page text all come from
    # external sites, so they are escaped before being placed in the markup.
    # html.escape also escapes quotes, which keeps the single-quoted href
    # attributes intact.
    esc = html.escape

    if refs:
        parts.append("<b>Top Academic References (arXiv):</b><ul>")
        for p in refs:
            ref_url = esc(str(p['url']))
            parts.append(
                f"<li><b>{esc(str(p['title']))}</b> ({p['year']}) - <i>{esc(str(p['authors']))}</i>"
                f"<br><a href='{ref_url}' target='_blank'>{ref_url}</a></li>"
            )
        parts.append("</ul>")
    else:
        parts.append("<i>No academic references found.</i><br>")

    if links:
        parts.append("<b>Top Web Resources (Google CSE):</b><ul>")
        for link in links:
            link_url = esc(str(link['url']))
            parts.append(
                f"<li><b>{esc(str(link['title']))}</b><br>{esc(str(link['snippet']))}"
                f"<br><a href='{link_url}' target='_blank'>{link_url}</a>"
            )
            if link['top_sections']:
                parts.append("<br><u>Top Extracted Sections:</u><ol>")
                parts.extend(f"<li>{esc(str(sec))}</li>" for sec in link['top_sections'])
                parts.append("</ol>")
            parts.append("</li>")
        parts.append("</ul>")
    else:
        parts.append("<i>No web links found.</i>")

    return "".join(parts)

# gradio
# UI wiring: a five-line textbox feeds classify_math_question and the HTML it
# returns is rendered in the output panel.  The app is launched as soon as
# this module runs.
_question_box = gr.Textbox(lines=5, label="Enter Math Question (LaTeX supported)")
_result_panel = gr.HTML(label="Predicted Domains + References")

iface = gr.Interface(
    fn=classify_math_question,
    inputs=_question_box,
    outputs=_result_panel,
    title="⚡ Math Domain Classifier with arXiv + Google",
    description="Classifies math problems into major/minor domains and fetches fast references from arXiv and Google.",
)

iface.launch()