# File: main/app.py
# Purpose: Fetch only the "relevant" page content (title, key metadata, clean body text, and hyperlinks)
# instead of returning full HTML. Output is compact and configurable to reduce verbosity.
import gradio as gr # UI library
import requests # HTTP client
from bs4 import BeautifulSoup # HTML parsing
from readability import Document # Readability algorithm to isolate main content
from urllib.parse import urljoin, urldefrag, urlparse # URL helpers
import re # For whitespace cleanup and simple formatting
# -------------------------------
# HTTP fetching with sane defaults
# -------------------------------
def _http_get(url: str) -> requests.Response:
    """
    Make an HTTP GET request with headers and a timeout.
    Layman's terms: downloads the webpage safely and politely.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; NymboFetcher/1.0; +https://example.com)",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    # Short timeout so the app isn't stuck forever
    return requests.get(url, headers=headers, timeout=15)
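# Illustrative usage (hypothetical URL, not executed at import time):
#   resp = _http_get("https://example.com/article")
#   resp.raise_for_status()  # status checking is left to the caller, see extract_relevant()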
# ----------------------------------------
# Helpers: text cleanup & friendly trimming
# ----------------------------------------
def _normalize_whitespace(text: str) -> str:
    """
    Layman's terms: squash weird spacing and too many blank lines.
    """
    text = re.sub(r"[ \t\u00A0]+", " ", text)              # collapse runs of spaces, tabs, and non-breaking spaces
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())  # allow at most one blank line in a row
    return text.strip()
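# Illustrative behaviour (made-up input, shown for clarity only):
#   _normalize_whitespace("a \t b\n\n\n\nc")  ->  "a b\n\nc"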
def _truncate(text: str, max_chars: int) -> tuple[str, bool]:
    """
    Layman's terms: cut the text if it’s too long and tell the caller if we cut it.
    """
    if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
        return text, False
    return text[:max_chars].rstrip() + " …", True
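# Illustrative behaviour (made-up strings):
#   _truncate("hello world", 5)  ->  ("hello …", True)
#   _truncate("short", 100)      ->  ("short", False)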
def _domain_of(url: str) -> str:
    """
    Layman's terms: show a friendly domain like example.com.
    """
    try:
        return urlparse(url).netloc or ""
    except Exception:
        return ""
# -----------------------------------
# Metadata extraction (title, etc.)
# -----------------------------------
def _extract_metadata(soup: BeautifulSoup, final_url: str) -> dict:
    """
    Layman's terms: grab useful fields like title, description, site name, and canonical link.
    """
    meta = {}

    # Title preference: <title> > og:title > twitter:title
    title_candidates = [
        (soup.title.string if soup.title and soup.title.string else None),
        _og(soup, "og:title"),
        _meta(soup, "twitter:title"),
    ]
    meta["title"] = next((t.strip() for t in title_candidates if t and t.strip()), "")

    # Description preference: meta[name=description] > og:description > twitter:description
    desc_candidates = [
        _meta(soup, "description"),
        _og(soup, "og:description"),
        _meta(soup, "twitter:description"),
    ]
    meta["description"] = next((d.strip() for d in desc_candidates if d and d.strip()), "")

    # Canonical URL if provided (helps dedupe / standardize)
    link_canonical = soup.find("link", rel=lambda v: v and "canonical" in v)
    meta["canonical"] = (link_canonical.get("href") or "").strip() if link_canonical else ""

    # Site name (nice for context)
    meta["site_name"] = (_og(soup, "og:site_name") or "").strip()

    # Language (if present)
    html_tag = soup.find("html")
    meta["lang"] = (html_tag.get("lang") or "").strip() if html_tag else ""

    # Final resolved URL and domain
    meta["fetched_url"] = final_url
    meta["domain"] = _domain_of(final_url)
    return meta
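# Shape of the returned dict (values here are placeholders, missing fields come back as ""):
#   {"title": "...", "description": "...", "canonical": "...", "site_name": "...",
#    "lang": "en", "fetched_url": "https://example.com/article", "domain": "example.com"}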
def _meta(soup: BeautifulSoup, name: str) -> str | None:
    tag = soup.find("meta", attrs={"name": name})
    return tag.get("content") if tag and tag.has_attr("content") else None


def _og(soup: BeautifulSoup, prop: str) -> str | None:
    tag = soup.find("meta", attrs={"property": prop})
    return tag.get("content") if tag and tag.has_attr("content") else None
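# Illustrative lookups (assuming the page contains the corresponding tags):
#   _meta(soup, "description")  reads  <meta name="description" content="...">
#   _og(soup, "og:title")       reads  <meta property="og:title" content="...">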
# ---------------------------------------------------------
# Main content extraction with Readability + gentle cleanup
# ---------------------------------------------------------
def _extract_main_text(html: str) -> tuple[str, BeautifulSoup]:
    """
    Layman's terms: use Readability to find the article body, then clean it to plain text.
    Returns (clean_text, soup_of_readable_html) for link scraping.
    """
    # Readability gives us a simplified article HTML
    doc = Document(html)
    readable_html = doc.summary(html_partial=True)

    # Parse the simplified HTML so we can clean it up further
    s = BeautifulSoup(readable_html, "lxml")

    # Remove obviously noisy elements if present
    for sel in ["script", "style", "noscript", "iframe", "svg"]:
        for tag in s.select(sel):
            tag.decompose()

    # Extract text with paragraphs preserved, then normalize whitespace
    text_parts = []
    for p in s.find_all(["p", "li", "h2", "h3", "h4", "blockquote"]):
        # Keep list items and headings to retain structure without being too verbose
        chunk = p.get_text(" ", strip=True)
        if chunk:
            text_parts.append(chunk)
    clean_text = _normalize_whitespace("\n\n".join(text_parts))
    return clean_text, s
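# Rough shape of the result on a simple article page (hypothetical HTML; the exact output
# depends on Readability's heuristics):
#   _extract_main_text("<html>...<p>First para</p><p>Second para</p>...</html>")
#   ->  ("First para\n\nSecond para", <BeautifulSoup of the simplified article HTML>)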
# ------------------------------------------
# Link extraction from the simplified content
# ------------------------------------------
def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> list[tuple[str, str]]:
    """
    Layman's terms: pull out clickable links from the article content only,
    turn them into absolute URLs, drop junk, dedupe, and cap the list.
    """
    # A non-positive cap means "no links at all" (e.g. the caller disabled links).
    if not max_links or max_links <= 0:
        return []

    seen = set()
    links: list[tuple[str, str]] = []
    for a in readable_soup.find_all("a", href=True):
        href = a.get("href").strip()

        # Ignore in-page anchors, mailto:, javascript:, and empty hrefs
        if not href or href.startswith(("#", "mailto:", "javascript:")):
            continue

        # Resolve relative URLs and strip URL fragments (#section)
        absolute = urljoin(base_url, href)
        absolute, _ = urldefrag(absolute)
        if absolute in seen:
            continue
        seen.add(absolute)

        text = a.get_text(" ", strip=True)
        # Keep link text concise
        if len(text) > 120:
            text = text[:117] + "…"
        links.append((text or absolute, absolute))

        if len(links) >= max_links:
            break
    return links
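# Illustrative resolution (hypothetical values):
#   base_url = "https://example.com/posts/42"
#   <a href="/about#team">About us</a>  ->  ("About us", "https://example.com/about")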
# -------------------------
# Formatter: compact output
# -------------------------
def _format_markdown(meta: dict, body: str, body_truncated: bool, links: list[tuple[str, str]],
                     include_text: bool, include_metadata: bool, include_links: bool, verbosity: str) -> str:
    """
    Layman's terms: turn the pieces into a neat, compact Markdown string.
    """
    lines = []

    # Title header
    title = meta.get("title") or meta.get("domain") or "Untitled"
    lines.append(f"# {title}")

    # Metadata (compact)
    if include_metadata:
        md = []
        # Only show fields that exist to keep things tight
        if meta.get("description"):
            md.append(f"- **Description:** {meta['description']}")
        if meta.get("site_name"):
            md.append(f"- **Site:** {meta['site_name']}")
        if meta.get("canonical"):
            md.append(f"- **Canonical:** {meta['canonical']}")
        if meta.get("lang"):
            md.append(f"- **Language:** {meta['lang']}")
        if meta.get("fetched_url"):
            md.append(f"- **Fetched From:** {meta['fetched_url']}")
        if md:
            lines.append("## Metadata")
            lines.extend(md)

    # Body text
    if include_text and body:
        # For "Brief", show a very small excerpt even after truncation
        if verbosity == "Brief":
            brief, was_more = _truncate(body, 800)
            lines.append("## Text")
            lines.append(brief)
            if was_more or body_truncated:
                lines.append("\n> (Trimmed for brevity)")
        else:
            lines.append("## Text")
            lines.append(body)
            if body_truncated:
                lines.append("\n> (Trimmed for brevity)")

    # Links
    if include_links and links:
        lines.append(f"## Links ({len(links)})")
        for text, url in links:
            lines.append(f"- [{text}]({url})")

    return "\n\n".join(lines).strip()
# --------------------------------
# Gradio-facing function (the app)
# --------------------------------
def extract_relevant(
    url: str,
    verbosity: str = "Standard",
    include_metadata: bool = True,
    include_text: bool = True,
    include_links: bool = True,
    max_chars: int = 3000,
    max_links: int = 20
) -> str:
    """
    Given a URL, fetch the page, extract just the good stuff, and return a compact Markdown summary.
    """
    if not url or not url.strip():
        return "Please enter a valid URL."

    try:
        resp = _http_get(url)
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"

    # Respect the final resolved URL (after redirects)
    final_url = str(resp.url)

    # Only process HTML-ish responses
    ctype = resp.headers.get("Content-Type", "")
    if "html" not in ctype.lower():
        return f"Unsupported content type for extraction: {ctype or 'unknown'}"

    # Decode as text (requests usually sets the encoding; otherwise fall back to its guess)
    resp.encoding = resp.encoding or resp.apparent_encoding
    html = resp.text

    # Full-page soup (to extract metadata accurately)
    full_soup = BeautifulSoup(html, "lxml")
    meta = _extract_metadata(full_soup, final_url)

    # Extract the main body text using Readability
    body_text, readable_soup = _extract_main_text(html)

    # If the body is suspiciously empty, fall back to a simpler text strategy
    if not body_text:
        fallback_text = full_soup.get_text(" ", strip=True)
        body_text = _normalize_whitespace(fallback_text)

    # Apply verbosity presets; the slider can only tighten the cap further
    preset_caps = {"Brief": 1200, "Standard": 3000, "Full": 999999}
    target_cap = preset_caps.get(verbosity, 3000)
    # Use the *smaller* of the user cap and the preset to keep things tidy
    cap = min(max_chars if max_chars > 0 else target_cap, target_cap)
    body_text, truncated = _truncate(body_text, cap) if include_text else ("", False)

    # Extract links from the readable portion only (cleaner than the whole DOM)
    links = _extract_links(readable_soup, final_url, max_links=max_links if include_links else 0)

    # Build the compact Markdown
    md = _format_markdown(
        meta=meta,
        body=body_text,
        body_truncated=truncated,
        links=links,
        include_text=include_text,
        include_metadata=include_metadata,
        include_links=include_links,
        verbosity=verbosity,
    )
    return md or "No content could be extracted."
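# Illustrative call (hypothetical URL; returns a Markdown string like the sketch above):
#   print(extract_relevant("https://example.com/article", verbosity="Brief", max_links=5))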
# -----------------
# Gradio UI (Blocks)
# -----------------
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    # Title & subtitle for clarity
    gr.Markdown("# Fetch MCP — Clean Extract")
    gr.Markdown(
        "Extract **title**, **metadata**, **clean text**, and **links** — without the noisy HTML. "
        "Use Verbosity and caps to keep it tight."
    )

    with gr.Row():
        url_in = gr.Textbox(label="URL", placeholder="https://example.com/article")
        fetch_btn = gr.Button("Fetch Clean Content")

    with gr.Accordion("Options", open=False):
        with gr.Row():
            verbosity = gr.Dropdown(
                label="Verbosity",
                choices=["Brief", "Standard", "Full"],
                value="Standard",
                info="Controls how much text you get back."
            )
            max_chars = gr.Slider(
                400, 12000, value=3000, step=100,
                label="Max Characters (body text)",
                info="Hard cap for body text. Lower = less verbose."
            )
            max_links = gr.Slider(
                0, 100, value=20, step=1,
                label="Max Links",
                info="Limit how many hyperlinks we include."
            )
        with gr.Row():
            include_metadata = gr.Checkbox(value=True, label="Include Metadata")
            include_text = gr.Checkbox(value=True, label="Include Main Text")
            include_links = gr.Checkbox(value=True, label="Include Links")

    # Output as Markdown (compact and readable)
    out = gr.Markdown(label="Result")

    # Wire up the click
    fetch_btn.click(
        fn=extract_relevant,
        inputs=[url_in, verbosity, include_metadata, include_text, include_links, max_chars, max_links],
        outputs=out
    )
# Keep MCP server enabled
if __name__ == "__main__":
    demo.launch(mcp_server=True)