Spaces:

Nymbo
/

Web

Sleeping

Web

File size: 12,748 Bytes

a655b89

# File: main/app.py
# Purpose: One Space that offers two tools:
#          1) Fetch: extract relevant page content (title, metadata, clean text, hyperlinks)
#          2) Websearch: DuckDuckGo web search
#
# Notes:
# - Launched with mcp_server=True so both functions are available as MCP tools.
# - UI uses TabbedInterface so you can use each tool from its own tab.
# - Inline comments explain each section in plain language.

from __future__ import annotations

import re                                         # (layman) used to tidy up whitespace
from typing import List, Dict, Literal, Tuple

import gradio as gr                               # (layman) the UI framework
import requests                                   # (layman) to download web pages
from bs4 import BeautifulSoup                     # (layman) for parsing HTML
from readability import Document                  # (layman) to isolate main readable content
from urllib.parse import urljoin, urldefrag, urlparse  # (layman) to fix/clean URLs

# DuckDuckGo via LangChain community tool
from langchain_community.tools import DuckDuckGoSearchResults


# ==============================
# Fetch: HTTP + extraction utils
# ==============================

def _http_get(url: str) -> requests.Response:
    """
    (layman) Fetch one page with a single GET request, sending browser-like headers.

    The request gives up after 15 seconds. The response is returned untouched;
    checking the HTTP status is left to the caller.
    """
    browser_like_headers = {}
    browser_like_headers["User-Agent"] = "Mozilla/5.0 (compatible; WebMCP/1.0; +https://example.com)"
    browser_like_headers["Accept-Language"] = "en-US,en;q=0.9"
    browser_like_headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"

    response = requests.get(url, timeout=15, headers=browser_like_headers)
    return response


def _normalize_whitespace(text: str) -> str:
    """
    (layman) Squeeze extra spaces and blank lines to keep things compact.
    """
    text = re.sub(r"[ \t\u00A0]+", " ", text)
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())
    return text.strip()


def _truncate(text: str, max_chars: int) -> Tuple[str, bool]:
    """
    (layman) Cut text if it gets too long; return the text and whether we trimmed.
    """
    if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
        return text, False
    return text[:max_chars].rstrip() + " …", True


def _domain_of(url: str) -> str:
    """
    (layman) Show a friendly site name like "example.com".
    """
    try:
        return urlparse(url).netloc or ""
    except Exception:
        return ""


def _meta(soup: BeautifulSoup, name: str) -> str | None:
    """
    (layman) Read the content of the first <meta name="..."> tag with the given name.

    Returns None when there is no such tag or it has no "content" attribute.
    """
    found = soup.find("meta", attrs={"name": name})
    if not found or not found.has_attr("content"):
        return None
    return found.get("content")


def _og(soup: BeautifulSoup, prop: str) -> str | None:
    """
    (layman) Read the content of the first <meta property="..."> tag (e.g. "og:title").

    Returns None when there is no such tag or it has no "content" attribute.
    """
    found = soup.find("meta", attrs={"property": prop})
    if not found or not found.has_attr("content"):
        return None
    return found.get("content")


def _extract_metadata(soup: BeautifulSoup, final_url: str) -> Dict[str, str]:
    """
    (layman) Gather the page's "about" info into one dictionary.

    Keys (always present, empty string when the page does not provide a value):
      title        - <title>, else og:title, else twitter:title
      description  - meta description, else og:description, else twitter:description
      canonical    - href of the <link rel="canonical"> tag, as written in the page
      site_name    - og:site_name
      lang         - the lang attribute of the <html> tag
      fetched_url  - final_url exactly as passed in
      domain       - host part of final_url
    """

    def first_nonblank(*candidates):
        # Candidates are in priority order; take the first one with real content.
        for value in candidates:
            if value and value.strip():
                return value.strip()
        return ""

    # <title> only counts when it holds a single plain string.
    page_title = soup.title.string if soup.title and soup.title.string else None

    # rel may list several values, so look for "canonical" inside it.
    canonical_tag = soup.find("link", rel=lambda v: v and "canonical" in v)
    if canonical_tag:
        canonical_href = (canonical_tag.get("href") or "").strip()
    else:
        canonical_href = ""

    root_tag = soup.find("html")
    if root_tag:
        language = (root_tag.get("lang") or "").strip()
    else:
        language = ""

    return {
        "title": first_nonblank(
            page_title,
            _og(soup, "og:title"),
            _meta(soup, "twitter:title"),
        ),
        "description": first_nonblank(
            _meta(soup, "description"),
            _og(soup, "og:description"),
            _meta(soup, "twitter:description"),
        ),
        "canonical": canonical_href,
        "site_name": (_og(soup, "og:site_name") or "").strip(),
        "lang": language,
        "fetched_url": final_url,
        "domain": _domain_of(final_url),
    }


def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
    """
    (layman) Use Readability to isolate the main article and turn it into clean text.

    The text is built from paragraphs, list items, h2-h4 headings and
    blockquotes, one chunk per element, separated by blank lines. An element
    that sits inside another selected element (for example a <p> inside a
    <blockquote> or <li>) is not emitted on its own, because the outer
    element's text already contains it; this avoids repeating the same
    sentences twice.

    Returns (clean_text, soup_of_readable_html). The soup has script, style,
    noscript, iframe and svg tags removed.
    """
    # Simplified article HTML from Readability
    doc = Document(html)
    readable_html = doc.summary(html_partial=True)

    # Parse simplified HTML
    s = BeautifulSoup(readable_html, "lxml")

    # Remove noisy tags
    for sel in ["script", "style", "noscript", "iframe", "svg"]:
        for tag in s.select(sel):
            tag.decompose()

    # Keep paragraphs, list items, and subheadings for structure without bloat
    block_tags = ["p", "li", "h2", "h3", "h4", "blockquote"]
    text_parts: List[str] = []
    for element in s.find_all(block_tags):
        # find_all also yields nested matches; their text is already part of
        # the enclosing element's chunk, so emitting them would duplicate it.
        if element.find_parent(block_tags) is not None:
            continue
        chunk = element.get_text(" ", strip=True)
        if chunk:
            text_parts.append(chunk)

    clean_text = _normalize_whitespace("\n\n".join(text_parts))
    return clean_text, s


def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> List[Tuple[str, str]]:
    """
    (layman) Collect clean, unique, absolute links from the readable section only.

    Args:
        readable_soup: Parsed HTML to scan for <a href="..."> tags.
        base_url: URL used to turn relative hrefs into absolute ones.
        max_links: Stop after this many links. Zero or a negative number
            means no cap.

    Returns:
        A list of (label, absolute_url) pairs in document order. The label is
        the link text (cut to 120 characters), or the URL itself when the
        link has no text. Fragments (#...) are stripped and each URL appears
        only once.
    """
    # Links we can't use. URI schemes are case-insensitive, so the check below
    # compares against the lowercased href.
    unusable_prefixes = ("#", "mailto:", "javascript:")

    seen = set()
    links: List[Tuple[str, str]] = []

    for a in readable_soup.find_all("a", href=True):
        href = a.get("href").strip()
        if not href or href.lower().startswith(unusable_prefixes):
            continue

        # Resolve relative URLs, strip fragments (#…)
        try:
            absolute, _ = urldefrag(urljoin(base_url, href))
        except ValueError:
            # urllib rejects some malformed hrefs (e.g. an unclosed "[" in the
            # host); one bad link should not abort the whole extraction.
            continue

        if absolute in seen:
            continue
        seen.add(absolute)

        text = a.get_text(" ", strip=True)
        if len(text) > 120:
            text = text[:117] + "…"

        links.append((text or absolute, absolute))

        if 0 < max_links <= len(links):
            break

    return links


def _format_markdown(
    meta: Dict[str, str],
    body: str,
    body_truncated: bool,
    links: List[Tuple[str, str]],
    include_text: bool,
    include_metadata: bool,
    include_links: bool,
    verbosity: str,
) -> str:
    """
    (layman) Assemble a compact Markdown summary with optional sections.
    """
    lines: List[str] = []

    # Title header
    title = meta.get("title") or meta.get("domain") or "Untitled"
    lines.append(f"# {title}")

    # Metadata section (only show what exists)
    if include_metadata:
        md: List[str] = []
        if meta.get("description"):
            md.append(f"- **Description:** {meta['description']}")
        if meta.get("site_name"):
            md.append(f"- **Site:** {meta['site_name']}")
        if meta.get("canonical"):
            md.append(f"- **Canonical:** {meta['canonical']}")
        if meta.get("lang"):
            md.append(f"- **Language:** {meta['lang']}")
        if meta.get("fetched_url"):
            md.append(f"- **Fetched From:** {meta['fetched_url']}")
        if md:
            lines.append("## Metadata")
            lines.extend(md)

    # Body text
    if include_text and body:
        if verbosity == "Brief":
            brief, was_more = _truncate(body, 800)
            lines.append("## Text")
            lines.append(brief)
            if was_more or body_truncated:
                lines.append("\n> (Trimmed for brevity)")
        else:
            lines.append("## Text")
            lines.append(body)
            if body_truncated:
                lines.append("\n> (Trimmed for brevity)")

    # Links section
    if include_links and links:
        lines.append(f"## Links ({len(links)})")
        for text, url in links:
            lines.append(f"- [{text}]({url})")

    return "\n\n".join(lines).strip()


def extract_relevant(  # <-- MCP tool #1
    url: str,
    verbosity: str = "Standard",
    include_metadata: bool = True,
    include_text: bool = True,
    include_links: bool = True,
    max_chars: int = 3000,
    max_links: int = 20,
) -> str:
    """
    (layman) Given a URL, return a tight Markdown summary: title, key metadata, readable text, and links.

    Args:
        url: Address of the HTML page to fetch.
        verbosity: "Brief", "Standard" or "Full". Sets the preset cap on body
            text (1200 / 3000 / 999,999 characters); unknown values use 3000.
        include_metadata: Include the "Metadata" section.
        include_text: Include the "Text" section.
        include_links: Include the "Links" section.
        max_chars: Cap on body text. The smaller of this and the verbosity
            preset is used; zero or less means "use the preset".
        max_links: Maximum number of links to list; zero or less means no cap.

    Returns:
        The Markdown summary, or a short plain-text message when the URL is
        blank, the request fails, or the response is not HTML.
    """
    if not url or not url.strip():
        return "Please enter a valid URL."
    # Request the same trimmed value that was just validated.
    url = url.strip()

    try:
        resp = _http_get(url)
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"

    final_url = str(resp.url)
    ctype = resp.headers.get("Content-Type", "")
    if "html" not in ctype.lower():
        return f"Unsupported content type for extraction: {ctype or 'unknown'}"

    # Decode to text. Requests pre-fills ISO-8859-1 for text/* responses that
    # declare no charset, so "encoding is set" does not mean the server chose
    # it; only trust it when the header really names a charset, otherwise
    # detect the encoding from the bytes.
    if "charset" not in ctype.lower() or not resp.encoding:
        resp.encoding = resp.apparent_encoding
    html = resp.text

    # Full-page soup for metadata
    full_soup = BeautifulSoup(html, "lxml")
    meta = _extract_metadata(full_soup, final_url)

    # Readable content
    body_text, readable_soup = _extract_main_text(html)
    if not body_text:
        # Fallback to "whole-page text" if Readability found nothing
        fallback_text = full_soup.get_text(" ", strip=True)
        body_text = _normalize_whitespace(fallback_text)

    # Verbosity presets (we keep the smaller of preset vs. user cap)
    preset_caps = {"Brief": 1200, "Standard": 3000, "Full": 999_999}
    target_cap = preset_caps.get(verbosity, 3000)
    cap = min(max_chars if max_chars > 0 else target_cap, target_cap)
    body_text, truncated = _truncate(body_text, cap) if include_text else ("", False)

    # Extract links from the simplified content only. Skip the scan entirely
    # when links are not wanted (a cap of 0 would mean "no cap", not "none").
    if include_links:
        links = _extract_links(readable_soup, final_url, max_links=max_links)
    else:
        links = []

    # Final compact Markdown
    md = _format_markdown(
        meta=meta,
        body=body_text,
        body_truncated=truncated,
        links=links,
        include_text=include_text,
        include_metadata=include_metadata,
        include_links=include_links,
        verbosity=verbosity,
    )
    return md or "No content could be extracted."


# ==========================
# Websearch: DuckDuckGo tool
# ==========================

def web_search(  # <-- MCP tool #2
    input_query: str,
    max_results: int = 5,
) -> List[Dict[Literal["snippet", "title", "link"], str]]:
    """
    (layman) Search the web with DuckDuckGo and hand back the hits.

    Args:
        input_query: The search text. A missing or whitespace-only query
            returns an empty list without searching.
        max_results: How many results to ask the search tool for.

    Returns:
        Whatever the search tool returns in its "list" output format.
    """
    query_is_blank = not (input_query and input_query.strip())
    if query_is_blank:
        return []

    # A fresh LangChain DuckDuckGo tool per call, configured for list output.
    ddg_tool = DuckDuckGoSearchResults(output_format="list", num_results=max_results)
    return ddg_tool.invoke(input_query)


# =====================
# UI: two-tab interface
# =====================

# --- Fetch tab (compact controllable extraction) ---
# One input widget per parameter of extract_relevant, listed in the same order
# as the function's signature; widget defaults match the function's defaults.
fetch_interface = gr.Interface(
    fn=extract_relevant,  # (layman) connect the function to the UI
    inputs=[
        gr.Textbox(label="URL", placeholder="https://example.com/article"),  # url
        gr.Dropdown(label="Verbosity", choices=["Brief", "Standard", "Full"], value="Standard"),  # verbosity
        gr.Checkbox(value=True, label="Include Metadata"),  # include_metadata
        gr.Checkbox(value=True, label="Include Main Text"),  # include_text
        gr.Checkbox(value=True, label="Include Links"),  # include_links
        gr.Slider(400, 12000, value=3000, step=100, label="Max Characters (body text)"),  # max_chars
        gr.Slider(0, 100, value=20, step=1, label="Max Links"),  # max_links
    ],
    outputs=gr.Markdown(label="Extracted Summary"),  # (layman) the returned Markdown is rendered, not shown raw
    title="Fetch — Clean Extract",
    description="Extract title, key metadata, readable text, and links. No noisy HTML.",
    allow_flagging="never",  # (layman) no "flag this output" feature on this tab
    theme="Nymbo/Nymbo_Theme",
)

# --- Websearch tab (DuckDuckGo) ---
# One input widget per parameter of web_search, in signature order.
websearch_interface = gr.Interface(
    fn=web_search,  # (layman) connect the function to the UI
    inputs=[
        gr.Textbox(value="", label="Search query", placeholder="site:example.com interesting topic"),  # input_query
        gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results"),  # max_results
    ],
    outputs=gr.JSON(label="Search results"),  # (layman) results are shown as JSON
    title="Websearch — DuckDuckGo",
    description="Search the web using DuckDuckGo; returns snippet, title, and link.",
    allow_flagging="never",  # (layman) no "flag this output" feature on this tab
    theme="Nymbo/Nymbo_Theme",
)

# --- Combine both into a single app with tabs ---
# tab_names pairs up with interface_list by position: "Fetch" labels
# fetch_interface and "Websearch" labels websearch_interface.
demo = gr.TabbedInterface(
    interface_list=[fetch_interface, websearch_interface],
    tab_names=["Fetch", "Websearch"],
    title="Web MCP — Fetch + Websearch",
    theme="Nymbo/Nymbo_Theme",
)

# Launch the UI and expose both functions as MCP tools in one server.
# The guard means this only happens when the file is run as a script;
# importing the module builds `demo` but does not start anything.
if __name__ == "__main__":
    demo.launch(mcp_server=True)