# File: app.py
# Purpose: Fetch only the readable text from a web page and return it as Markdown
# Notes: Uses reader-view extraction so the output is clean Markdown rather than raw HTML.
import re
import time
import gradio as gr
import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup # used as a fallback cleaner
from readability import Document # isolates the "main content" like reader view
import html2text # converts HTML to Markdown
# ----------------------------
# Simple in-memory cache (tiny LRU-ish)
# ----------------------------
# layman's terms: we remember recent results so repeated requests for the same URL are instant
_CACHE = {}
_CACHE_ORDER = []
_CACHE_MAX = 64
_CACHE_TTL_SECONDS = 10 * 60 # 10 minutes
def _cache_get(key):
# layman's terms: give me the saved value if it's still fresh
item = _CACHE.get(key)
if not item:
return None
value, ts = item
if time.time() - ts > _CACHE_TTL_SECONDS:
_CACHE.pop(key, None)
return None
# refresh order
if key in _CACHE_ORDER:
_CACHE_ORDER.remove(key)
_CACHE_ORDER.append(key)
return value
def _cache_set(key, value):
# layman's terms: save a result and keep the list from growing too large
_CACHE[key] = (value, time.time())
if key in _CACHE_ORDER:
_CACHE_ORDER.remove(key)
_CACHE_ORDER.append(key)
while len(_CACHE_ORDER) > _CACHE_MAX:
oldest = _CACHE_ORDER.pop(0)
_CACHE.pop(oldest, None)
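# Illustrative usage (hypothetical key/value, not part of the app flow):
#   _cache_set("https://example.com", "# Example Domain")
#   _cache_get("https://example.com")        -> "# Example Domain" (until the 10-minute TTL expires)
#   _cache_get("https://not-cached.example") -> None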
# ----------------------------
# Helpers
# ----------------------------
def _normalize_url(url: str) -> str:
"""
layman's terms: if the user forgot 'https://', add it.
"""
url = url.strip()
parsed = urlparse(url)
if not parsed.scheme:
url = "https://" + url
return url
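# Illustrative examples (assumed inputs, shown as comments only):
#   _normalize_url("example.com")               -> "https://example.com"
#   _normalize_url("  https://example.com/a ")  -> "https://example.com/a"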
def _too_large_via_head(url: str, max_bytes: int = 2_500_000) -> bool:
"""
layman's terms: do a quick HEAD request; if the server says the page is huge, we skip it.
"""
try:
head = requests.head(
url,
allow_redirects=True,
timeout=(5, 10),
headers={
"User-Agent": "Mozilla/5.0",
"Accept": "text/html,application/xhtml+xml",
"Accept-Encoding": "gzip, deflate, br",
},
)
size = head.headers.get("Content-Length")
if size and size.isdigit():
return int(size) > max_bytes
except requests.exceptions.RequestException:
# layman's terms: if HEAD fails, we won't block the GET just because of that
pass
return False
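# Illustrative behavior (assumed server responses):
#   Content-Length: 5_000_000                 -> True  (skip the download)
#   Content-Length missing, or the HEAD fails -> False (proceed with the GET)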
def _fetch_html(url: str) -> str:
"""
layman's terms: download the page HTML (not images/scripts), with a timeout and errors handled.
"""
resp = requests.get(
url,
timeout=(5, 20), # connect, read
headers={
"User-Agent": "Mozilla/5.0",
"Accept": "text/html,application/xhtml+xml",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.8",
},
)
resp.raise_for_status()
# Only proceed for text/html payloads
ctype = resp.headers.get("Content-Type", "")
if "text/html" not in ctype.lower():
# layman's terms: if it's not a web page (maybe JSON/PDF/etc), just give raw text
return resp.text
# Respect declared encoding where possible
resp.encoding = resp.encoding or "utf-8"
return resp.text
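# Illustrative outcomes (assumed responses):
#   200 with Content-Type: text/html        -> returns the page HTML
#   200 with Content-Type: application/json -> returns the raw response text unchanged
#   404 / 500                                -> raises requests.exceptions.HTTPError,
#                                               which fetch_markdown reports as a network error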
def _extract_main_html(html: str) -> str:
"""
layman's terms: use reader mode (Readability) to isolate the main article/body content.
Falls back to stripping scripts/styles if Readability can't find a core.
"""
try:
doc = Document(html)
main_html = doc.summary(html_partial=True) # main content as HTML
# Make sure we still have something useful
if main_html and len(main_html) > 40:
return main_html
except Exception:
pass
# Fallback: strip scripts/styles and return a body-only HTML
soup = BeautifulSoup(html, "html.parser")
for tag in soup(["script", "style", "noscript"]):
tag.decompose()
body = soup.body or soup
return str(body)
def _html_to_markdown(html: str) -> str:
"""
layman's terms: convert the cleaned HTML into nice Markdown with links and headings.
"""
h = html2text.HTML2Text()
h.ignore_images = True # don't inline images in Markdown
h.ignore_links = False # keep links as [text](url)
h.body_width = 0 # don't hard-wrap lines
h.protect_links = True
h.single_line_break = True
md = h.handle(html)
# Tidy up excessive blank lines/whitespace
md = re.sub(r"\n{3,}", "\n\n", md).strip()
return md or "_No readable text found on this page._"
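# Illustrative conversion (assumed input; exact spacing depends on the html2text version):
#   _html_to_markdown('<h1>Hi</h1><p>See <a href="https://example.com">this</a>.</p>')
#   -> roughly "# Hi" followed by "See [this](https://example.com)."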
# ----------------------------
# Main callable for Gradio
# ----------------------------
def fetch_markdown(url: str) -> str:
"""
layman's terms: the function the UI calls.
Steps:
1) sanitize the URL
2) quick HEAD check to avoid massive pages
3) GET the HTML
4) isolate the main content
5) convert to Markdown
6) return Markdown
"""
if not url or not url.strip():
return "_Please enter a URL._"
try:
url = _normalize_url(url)
# Return cached value if available
cached = _cache_get(url)
if cached:
return cached
# Optional efficiency: skip very large pages before downloading
if _too_large_via_head(url):
return "_The page is too large to fetch efficiently (over ~2.5 MB)._"
html = _fetch_html(url)
        # Non-HTML payloads (e.g., JSON) were already returned as raw text by _fetch_html,
        # so no extra content-type handling is needed here.
main_html = _extract_main_html(html)
markdown = _html_to_markdown(main_html)
_cache_set(url, markdown)
return markdown
except requests.exceptions.RequestException as e:
# layman's terms: network or HTTP error
return f"_Network error: {e}_"
except Exception as e:
# layman's terms: any other unexpected error
return f"_Unexpected error: {e}_"
# ----------------------------
# Gradio UI
# ----------------------------
with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP — Markdown") as demo:
# layman's terms: a simple, centered header explaining what this tool does
gr.Markdown("# Fetch MCP (Markdown)\nFetch a page and show just its readable text as Markdown.")
with gr.Row():
url_box = gr.Textbox(
label="URL",
placeholder="example.com or https://example.com/article",
)
fetch_btn = gr.Button("Fetch")
# layman's terms: show the result as rendered Markdown (not a plain textbox)
output_md = gr.Markdown(label="Readable Markdown")
# layman's terms: helpful example URLs to try with one click
gr.Examples(
examples=[
["https://en.wikipedia.org/wiki/Hugging_Face"],
["https://huggingface.co/blog"],
["https://www.bbc.com/news"],
],
inputs=[url_box],
)
fetch_btn.click(fetch_markdown, inputs=url_box, outputs=output_md)
url_box.submit(fetch_markdown, inputs=url_box, outputs=output_md)
if __name__ == "__main__":
demo.launch(mcp_server=True)
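    # Note: mcp_server=True also exposes fetch_markdown as an MCP tool; this assumes
    # a Gradio install with MCP support (e.g. the "gradio[mcp]" extra).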