# File: app.py
# Purpose: Fetch only the readable text from a web page and return it as Markdown
# Notes: Uses reader-view extraction so the output is clean Markdown rather than raw HTML.
import re
import time
import gradio as gr
import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup # used as a fallback cleaner
from readability import Document # isolates the "main content" like reader view
import html2text # converts HTML to Markdown
# ----------------------------
# Simple in-memory cache (tiny LRU-ish)
# ----------------------------
# layman's terms: we remember recent results so repeated requests for the same URL are instant
_CACHE = {}
_CACHE_ORDER = []
_CACHE_MAX = 64
_CACHE_TTL_SECONDS = 10 * 60 # 10 minutes
def _cache_get(key):
# layman's terms: give me the saved value if it's still fresh
item = _CACHE.get(key)
if not item:
return None
value, ts = item
if time.time() - ts > _CACHE_TTL_SECONDS:
_CACHE.pop(key, None)
return None
# refresh order
if key in _CACHE_ORDER:
_CACHE_ORDER.remove(key)
_CACHE_ORDER.append(key)
return value
def _cache_set(key, value):
# layman's terms: save a result and keep the list from growing too large
_CACHE[key] = (value, time.time())
if key in _CACHE_ORDER:
_CACHE_ORDER.remove(key)
_CACHE_ORDER.append(key)
while len(_CACHE_ORDER) > _CACHE_MAX:
oldest = _CACHE_ORDER.pop(0)
_CACHE.pop(oldest, None)
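# Illustrative usage (hypothetical key/value, not part of the app flow):
#   _cache_set("https://example.com", "# Example Domain")
#   _cache_get("https://example.com")        -> "# Example Domain" (until the 10-minute TTL expires)
#   _cache_get("https://not-cached.example") -> None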
# ----------------------------
# Helpers
# ----------------------------
def _normalize_url(url: str) -> str:
"""
layman's terms: if the user forgot 'https://', add it.
"""
url = url.strip()
parsed = urlparse(url)
if not parsed.scheme:
url = "https://" + url
return url
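# Illustrative examples (assumed inputs, shown as comments only):
#   _normalize_url("example.com")               -> "https://example.com"
#   _normalize_url("  https://example.com/a ")  -> "https://example.com/a"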
def _too_large_via_head(url: str, max_bytes: int = 2_500_000) -> bool:
"""
layman's terms: do a quick HEAD request; if the server says the page is huge, we skip it.
"""
try:
head = requests.head(
url,
allow_redirects=True,
timeout=(5, 10),
headers={
"User-Agent": "Mozilla/5.0",
"Accept": "text/html,application/xhtml+xml",
"Accept-Encoding": "gzip, deflate, br",
},
)
size = head.headers.get("Content-Length")
if size and size.isdigit():
return int(size) > max_bytes
except requests.exceptions.RequestException:
# layman's terms: if HEAD fails, we won't block the GET just because of that
pass
return False
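# Illustrative behavior (assumed server responses):
#   Content-Length: 5_000_000                 -> True  (skip the download)
#   Content-Length missing, or the HEAD fails -> False (proceed with the GET)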
def _fetch_html(url: str) -> str:
"""
layman's terms: download the page HTML (not images/scripts), with a timeout and errors handled.
"""
resp = requests.get(
url,
timeout=(5, 20), # connect, read
headers={
"User-Agent": "Mozilla/5.0",
"Accept": "text/html,application/xhtml+xml",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.8",
},
)
resp.raise_for_status()
# Only proceed for text/html payloads
ctype = resp.headers.get("Content-Type", "")
if "text/html" not in ctype.lower():
# layman's terms: if it's not a web page (maybe JSON/PDF/etc), just give raw text
return resp.text
# Respect declared encoding where possible
resp.encoding = resp.encoding or "utf-8"
return resp.text
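# Illustrative outcomes (assumed responses):
#   200 with Content-Type: text/html        -> returns the page HTML
#   200 with Content-Type: application/json -> returns the raw response text unchanged
#   404 / 500                                -> raises requests.exceptions.HTTPError,
#                                               which fetch_markdown reports as a network error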
def _extract_main_html(html: str) -> str:
"""
layman's terms: use reader mode (Readability) to isolate the main article/body content.
Falls back to stripping scripts/styles if Readability can't find a core.
"""
try:
doc = Document(html)
main_html = doc.summary(html_partial=True) # main content as HTML
# Make sure we still have something useful
if main_html and len(main_html) > 40:
return main_html
except Exception:
pass
# Fallback: strip scripts/styles and return a body-only HTML
soup = BeautifulSoup(html, "html.parser")
for tag in soup(["script", "style", "noscript"]):
tag.decompose()
body = soup.body or soup
return str(body)
def _html_to_markdown(html: str) -> str:
"""
layman's terms: convert the cleaned HTML into nice Markdown with links and headings.
"""
h = html2text.HTML2Text()
h.ignore_images = True # don't inline images in Markdown
h.ignore_links = False # keep links as [text](url)
h.body_width = 0 # don't hard-wrap lines
h.protect_links = True
h.single_line_break = True
md = h.handle(html)
# Tidy up excessive blank lines/whitespace
md = re.sub(r"\n{3,}", "\n\n", md).strip()
return md or "_No readable text found on this page._"
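# Illustrative conversion (assumed input; exact spacing depends on the html2text version):
#   _html_to_markdown('<h1>Hi</h1><p>See <a href="https://example.com">this</a>.</p>')
#   -> roughly "# Hi" followed by "See [this](https://example.com)."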
# ----------------------------
# Main callable for Gradio
# ----------------------------
def fetch_markdown(url: str) -> str:
"""
layman's terms: the function the UI calls.
Steps:
1) sanitize the URL
2) quick HEAD check to avoid massive pages
3) GET the HTML
4) isolate the main content
5) convert to Markdown
6) return Markdown
"""
if not url or not url.strip():
return "_Please enter a URL._"
try:
url = _normalize_url(url)
# Return cached value if available
cached = _cache_get(url)
if cached:
return cached
# Optional efficiency: skip very large pages before downloading
if _too_large_via_head(url):
return "_The page is too large to fetch efficiently (over ~2.5 MB)._"
html = _fetch_html(url)
        # Non-HTML payloads (e.g., JSON) were already returned as raw text by _fetch_html,
        # so no extra content-type handling is needed here.
main_html = _extract_main_html(html)
markdown = _html_to_markdown(main_html)
_cache_set(url, markdown)
return markdown
except requests.exceptions.RequestException as e:
# layman's terms: network or HTTP error
return f"_Network error: {e}_"
except Exception as e:
# layman's terms: any other unexpected error
return f"_Unexpected error: {e}_"
# ----------------------------
# Gradio UI
# ----------------------------
with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP — Markdown") as demo:
# layman's terms: a simple, centered header explaining what this tool does
gr.Markdown("# Fetch MCP (Markdown)\nFetch a page and show just its readable text as Markdown.")
with gr.Row():
url_box = gr.Textbox(
label="URL",
placeholder="example.com or https://example.com/article",
)
fetch_btn = gr.Button("Fetch")
# layman's terms: show the result as rendered Markdown (not a plain textbox)
output_md = gr.Markdown(label="Readable Markdown")
# layman's terms: helpful example URLs to try with one click
gr.Examples(
examples=[
["https://en.wikipedia.org/wiki/Hugging_Face"],
["https://huggingface.co/blog"],
["https://www.bbc.com/news"],
],
inputs=[url_box],
)
fetch_btn.click(fetch_markdown, inputs=url_box, outputs=output_md)
url_box.submit(fetch_markdown, inputs=url_box, outputs=output_md)
if __name__ == "__main__":
demo.launch(mcp_server=True)
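    # Note: mcp_server=True also exposes fetch_markdown as an MCP tool; this assumes
    # a Gradio install with MCP support (e.g. the "gradio[mcp]" extra).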