# app.py
# Hugging Face Space: Cleaner web-page fetcher
# -------------------------------------------------------------
# Fetches a URL and returns a concise, human-readable snapshot:
#   • Title
#   • Meta description
#   • Main text (readability-extracted)
#   • Hyperlinks (anchor text → absolute URL)
# -------------------------------------------------------------
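# Dependencies (assumed; pin them in the Space's requirements.txt):
#   requests, beautifulsoup4, lxml (parser used below), gradio,
#   and readability-lxml, which provides the `readability` import.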
import requests                   # HTTP client
from bs4 import BeautifulSoup     # HTML parsing
from readability import Document  # Boilerplate removal
from urllib.parse import urljoin  # Build absolute link URLs
import gradio as gr               # UI framework

def extract_relevant_text(html: str, base_url: str) -> str:
    """
    Convert raw HTML into a clean, plain-text summary.
    - html: the page's HTML source
    - base_url: needed for resolving relative <a href="">
    Returns a formatted string ready for display.
    """
    # 1) Let readability isolate the primary article/content
    doc = Document(html)
    title = doc.short_title()
    summary_html = doc.summary()  # cleaned, minimal HTML
    summary_soup = BeautifulSoup(summary_html, "lxml")

    # 2) Grab visible paragraph & list text
    body_parts = [
        tag.get_text(" ", strip=True)
        for tag in summary_soup.find_all(["p", "li"])
        if tag.get_text(strip=True)
    ]
    main_text = "\n\n".join(body_parts) or "[No main text extracted]"

    # 3) Extract the meta description from the *full* document
    full_soup = BeautifulSoup(html, "lxml")
    meta_desc = ""
    meta_tag = full_soup.find("meta", attrs={"name": "description"})
    if meta_tag and meta_tag.get("content"):
        meta_desc = meta_tag["content"].strip()
    else:  # Fall back to the Open Graph description
        og_tag = full_soup.find("meta", attrs={"property": "og:description"})
        if og_tag and og_tag.get("content"):
            meta_desc = og_tag["content"].strip()

    # 4) Build a neat list of hyperlinks (anchor text → absolute URL)
    links = []
    for a in summary_soup.find_all("a", href=True):
        href_abs = urljoin(base_url, a["href"])
        text = a.get_text(" ", strip=True) or "[link]"
        links.append(f"• {text} → {href_abs}")

    # 5) Compose the final plaintext output
    sections = [
        f"Title: {title}",
        f"Description: {meta_desc or '[None]'}",
        f"Body:\n{main_text}",
        "Links:\n" + ("\n".join(links) if links else "[No links]"),
    ]
    return "\n\n".join(sections)

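# Illustrative only, not executed by the app: a rough sketch of the output
# shape for a hypothetical toy page (readability's extraction on such minimal
# HTML may differ slightly in practice):
#
#   sample = (
#       '<html><head><title>Demo</title>'
#       '<meta name="description" content="A tiny page."></head>'
#       '<body><p>Hello <a href="/about">about us</a></p></body></html>'
#   )
#   print(extract_relevant_text(sample, "https://example.com"))
#   # Title: Demo
#   # Description: A tiny page.
#   # Body: Hello about us
#   # Links: • about us → https://example.com/about
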
def fetch_content(url: str) -> str:
    """
    Fetch the URL and return a concise summary.
    Includes basic error handling for network issues.
    """
    try:
        # Friendly user-agent prevents some 403s
        headers = {"User-Agent": "Mozilla/5.0 (compatible; CleanFetcher/1.0)"}
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()  # 4xx/5xx → exception
        return extract_relevant_text(response.text, url)
    except requests.exceptions.RequestException as err:
        # Any network or HTTP error bubbles up here
        return f"[Error] {err}"

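# Quick manual check, bypassing the UI (assumes network access and that this
# file is importable as `app`):
#   python -c "from app import fetch_content; print(fetch_content('https://example.com'))"
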
# -------------------------- Gradio UI --------------------------
demo = gr.Interface(
    fn=fetch_content,
    inputs=gr.Textbox(label="URL", placeholder="https://example.com"),
    outputs=gr.Textbox(
        label="Clean Page Snapshot",
        interactive=False,
        lines=25,  # taller box for readability
    ),
    title="Clean Web Snapshot",
    description="Enter a URL to retrieve a tidy text summary (title, description, main content, and links).",
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme",
)
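# Note: mcp_server=True requires MCP support in Gradio; per the Gradio docs
# this means a recent release installed with the MCP extra, e.g.
#   pip install "gradio[mcp]"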
if __name__ == "__main__":
    # Expose as an MCP server so you can chain it with other Spaces
    demo.launch(mcp_server=True)