# File: app.py
# Purpose: Fetch only the readable text from a web page and return it as Markdown
# Notes: This version is more efficient and user-friendly than returning raw HTML.
import re
import time
import gradio as gr
import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup # used as a fallback cleaner
from readability import Document # isolates the "main content" like reader view
import html2text # converts HTML to Markdown
# ----------------------------
# Simple in-memory cache (tiny LRU-ish)
# ----------------------------
# layman's terms: we remember recent results so repeated requests for the same URL are instant
_CACHE = {}
_CACHE_ORDER = []
_CACHE_MAX = 64
_CACHE_TTL_SECONDS = 10 * 60 # 10 minutes
def _cache_get(key):
    # layman's terms: give me the saved value if it's still fresh
    item = _CACHE.get(key)
    if not item:
        return None
    value, ts = item
    if time.time() - ts > _CACHE_TTL_SECONDS:
        _CACHE.pop(key, None)
        return None
    # refresh order
    if key in _CACHE_ORDER:
        _CACHE_ORDER.remove(key)
    _CACHE_ORDER.append(key)
    return value
def _cache_set(key, value):
    # layman's terms: save a result and keep the list from growing too large
    _CACHE[key] = (value, time.time())
    if key in _CACHE_ORDER:
        _CACHE_ORDER.remove(key)
    _CACHE_ORDER.append(key)
    while len(_CACHE_ORDER) > _CACHE_MAX:
        oldest = _CACHE_ORDER.pop(0)
        _CACHE.pop(oldest, None)
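# Illustrative cache behaviour (a minimal sketch, not executed here): repeating a
# lookup within the 10-minute TTL returns the stored Markdown without refetching.
#
#   _cache_set("https://example.com", "# Example page")
#   _cache_get("https://example.com")        # -> "# Example page" (fresh hit)
#   _cache_get("https://never-stored.test")  # -> None (miss)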
# ----------------------------
# Helpers
# ----------------------------
def _normalize_url(url: str) -> str:
"""
layman's terms: if the user forgot 'https://', add it.
"""
url = url.strip()
parsed = urlparse(url)
if not parsed.scheme:
url = "https://" + url
return url
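# Illustrative examples (not executed here): urlparse reports an empty scheme for a
# bare domain, so "https://" is prepended; full URLs pass through with whitespace stripped.
#
#   _normalize_url("example.com/article")        # -> "https://example.com/article"
#   _normalize_url("  https://example.com/a  ")  # -> "https://example.com/a"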
def _too_large_via_head(url: str, max_bytes: int = 2_500_000) -> bool:
"""
layman's terms: do a quick HEAD request; if the server says the page is huge, we skip it.
"""
try:
head = requests.head(
url,
allow_redirects=True,
timeout=(5, 10),
headers={
"User-Agent": "Mozilla/5.0",
"Accept": "text/html,application/xhtml+xml",
"Accept-Encoding": "gzip, deflate, br",
},
)
size = head.headers.get("Content-Length")
if size and size.isdigit():
return int(size) > max_bytes
except requests.exceptions.RequestException:
# layman's terms: if HEAD fails, we won't block the GET just because of that
pass
return False
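# Illustrative behaviour (a sketch, not executed here): the check is conservative, so a
# page only gets skipped when the server actually declares a large size up front.
#
#   Content-Length: 5000000   -> returns True  (5 MB > 2.5 MB cap, page is skipped)
#   Content-Length missing    -> returns False (we fall through to the normal GET)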
def _fetch_html(url: str) -> str:
"""
layman's terms: download the page HTML (not images/scripts), with a timeout and errors handled.
"""
resp = requests.get(
url,
timeout=(5, 20), # connect, read
headers={
"User-Agent": "Mozilla/5.0",
"Accept": "text/html,application/xhtml+xml",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.8",
},
)
resp.raise_for_status()
# Only proceed for text/html payloads
ctype = resp.headers.get("Content-Type", "")
if "text/html" not in ctype.lower():
# layman's terms: if it's not a web page (maybe JSON/PDF/etc), just give raw text
return resp.text
# Respect declared encoding where possible
resp.encoding = resp.encoding or "utf-8"
return resp.text
def _extract_main_html(html: str) -> str:
"""
layman's terms: use reader mode (Readability) to isolate the main article/body content.
Falls back to stripping scripts/styles if Readability can't find a core.
"""
try:
doc = Document(html)
main_html = doc.summary(html_partial=True) # main content as HTML
# Make sure we still have something useful
if main_html and len(main_html) > 40:
return main_html
except Exception:
pass
# Fallback: strip scripts/styles and return a body-only HTML
soup = BeautifulSoup(html, "html.parser")
for tag in soup(["script", "style", "noscript"]):
tag.decompose()
body = soup.body or soup
return str(body)
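# Illustrative example (assuming Readability can't isolate an article in such a tiny
# snippet and the BeautifulSoup fallback runs): scripts are dropped, the body survives.
#
#   _extract_main_html("<html><body><script>x()</script><p>Hello</p></body></html>")
#   # -> roughly "<body><p>Hello</p></body>"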
def _html_to_markdown(html: str) -> str:
"""
layman's terms: convert the cleaned HTML into nice Markdown with links and headings.
"""
h = html2text.HTML2Text()
h.ignore_images = True # don't inline images in Markdown
h.ignore_links = False # keep links as [text](url)
h.body_width = 0 # don't hard-wrap lines
h.protect_links = True
h.single_line_break = True
md = h.handle(html)
# Tidy up excessive blank lines/whitespace
md = re.sub(r"\n{3,}", "\n\n", md).strip()
return md or "_No readable text found on this page._"
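# Illustrative conversion (a rough sketch, not executed here): headings become "#"
# lines and anchors keep their targets; exact whitespace depends on html2text.
#
#   _html_to_markdown("<h1>Title</h1><p>See <a href='https://example.com'>this</a>.</p>")
#   # -> roughly "# Title\n\nSee [this](https://example.com)."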
# ----------------------------
# Main callable for Gradio
# ----------------------------
def fetch_markdown(url: str) -> str:
"""
layman's terms: the function the UI calls.
Steps:
1) sanitize the URL
2) quick HEAD check to avoid massive pages
3) GET the HTML
4) isolate the main content
5) convert to Markdown
6) return Markdown
"""
if not url or not url.strip():
return "_Please enter a URL._"
try:
url = _normalize_url(url)
# Return cached value if available
cached = _cache_get(url)
if cached:
return cached
# Optional efficiency: skip very large pages before downloading
if _too_large_via_head(url):
return "_The page is too large to fetch efficiently (over ~2.5 MB)._"
html = _fetch_html(url)
# If server returned non-HTML (e.g., JSON), just code-fence it
if "text/html" not in (requests.utils.get_encoding_from_headers({"content-type": "text/html"}) or "text/html"):
# This condition is a no-op; we already content-typed in _fetch_html.
pass
main_html = _extract_main_html(html)
markdown = _html_to_markdown(main_html)
_cache_set(url, markdown)
return markdown
except requests.exceptions.RequestException as e:
# layman's terms: network or HTTP error
return f"_Network error: {e}_"
except Exception as e:
# layman's terms: any other unexpected error
return f"_Unexpected error: {e}_"
# ----------------------------
# Gradio UI
# ----------------------------
with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP — Markdown") as demo:
    # layman's terms: a simple header explaining what this tool does
    gr.Markdown("# Fetch MCP (Markdown)\nFetch a page and show just its readable text as Markdown.")
    with gr.Row():
        url_box = gr.Textbox(
            label="URL",
            placeholder="example.com or https://example.com/article",
        )
        fetch_btn = gr.Button("Fetch")
    # layman's terms: show the result as rendered Markdown (not a plain textbox)
    output_md = gr.Markdown(label="Readable Markdown")
    # layman's terms: helpful example URLs to try with one click
    gr.Examples(
        examples=[
            ["https://en.wikipedia.org/wiki/Hugging_Face"],
            ["https://huggingface.co/blog"],
            ["https://www.bbc.com/news"],
        ],
        inputs=[url_box],
    )
    fetch_btn.click(fetch_markdown, inputs=url_box, outputs=output_md)
    url_box.submit(fetch_markdown, inputs=url_box, outputs=output_md)
if __name__ == "__main__":
    demo.launch(mcp_server=True)