# app.py
# Hugging Face Space: Cleaner web-page fetcher
# -------------------------------------------------------------
# Fetches a URL and returns a concise, human-readable snapshot:
#   • Title
#   • Meta description
#   • Main text (readability-extracted)
#   • Hyperlinks (anchor text → absolute URL)
# -------------------------------------------------------------
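# Dependencies (assumed; pin them in the Space's requirements.txt):
#   requests, beautifulsoup4, lxml (parser used below), gradio,
#   and readability-lxml, which provides the `readability` import.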
import requests                   # HTTP client
from bs4 import BeautifulSoup     # HTML parsing
from readability import Document  # Boilerplate removal
from urllib.parse import urljoin  # Build absolute link URLs
import gradio as gr               # UI framework

def extract_relevant_text(html: str, base_url: str) -> str:
    """
    Convert raw HTML into a clean, plain-text summary.
    - html: the page's HTML source
    - base_url: needed for resolving relative <a href="">
    Returns a formatted string ready for display.
    """
    # 1) Let readability isolate the primary article/content
    doc = Document(html)
    title = doc.short_title()
    summary_html = doc.summary()  # cleaned, minimal HTML
    summary_soup = BeautifulSoup(summary_html, "lxml")

    # 2) Grab visible paragraph & list text
    body_parts = [
        tag.get_text(" ", strip=True)
        for tag in summary_soup.find_all(["p", "li"])
        if tag.get_text(strip=True)
    ]
    main_text = "\n\n".join(body_parts) or "[No main text extracted]"

    # 3) Extract the meta description from the *full* document
    full_soup = BeautifulSoup(html, "lxml")
    meta_desc = ""
    meta_tag = full_soup.find("meta", attrs={"name": "description"})
    if meta_tag and meta_tag.get("content"):
        meta_desc = meta_tag["content"].strip()
    else:  # Fall back to the Open Graph description
        og_tag = full_soup.find("meta", attrs={"property": "og:description"})
        if og_tag and og_tag.get("content"):
            meta_desc = og_tag["content"].strip()

    # 4) Build a neat list of hyperlinks (anchor text → absolute URL)
    links = []
    for a in summary_soup.find_all("a", href=True):
        href_abs = urljoin(base_url, a["href"])
        text = a.get_text(" ", strip=True) or "[link]"
        links.append(f"• {text} → {href_abs}")

    # 5) Compose the final plaintext output
    sections = [
        f"Title: {title}",
        f"Description: {meta_desc or '[None]'}",
        f"Body:\n{main_text}",
        "Links:\n" + ("\n".join(links) if links else "[No links]"),
    ]
    return "\n\n".join(sections)

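# Illustrative only, not executed by the app: a rough sketch of the output
# shape for a hypothetical toy page (readability's extraction on such minimal
# HTML may differ slightly in practice):
#
#   sample = (
#       '<html><head><title>Demo</title>'
#       '<meta name="description" content="A tiny page."></head>'
#       '<body><p>Hello <a href="/about">about us</a></p></body></html>'
#   )
#   print(extract_relevant_text(sample, "https://example.com"))
#   # Title: Demo
#   # Description: A tiny page.
#   # Body: Hello about us
#   # Links: • about us → https://example.com/about
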
def fetch_content(url: str) -> str:
    """
    Fetch the URL and return a concise summary.
    Includes basic error handling for network issues.
    """
    try:
        # Friendly user-agent prevents some 403s
        headers = {"User-Agent": "Mozilla/5.0 (compatible; CleanFetcher/1.0)"}
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()  # 4xx/5xx → exception
        return extract_relevant_text(response.text, url)
    except requests.exceptions.RequestException as err:
        # Any network or HTTP error bubbles up here
        return f"[Error] {err}"

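# Quick manual check, bypassing the UI (assumes network access and that this
# file is importable as `app`):
#   python -c "from app import fetch_content; print(fetch_content('https://example.com'))"
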
# -------------------------- Gradio UI --------------------------
demo = gr.Interface(
    fn=fetch_content,
    inputs=gr.Textbox(label="URL", placeholder="https://example.com"),
    outputs=gr.Textbox(
        label="Clean Page Snapshot",
        interactive=False,
        lines=25,  # taller box for readability
    ),
    title="Clean Web Snapshot",
    description="Enter a URL to retrieve a tidy text summary (title, description, main content, and links).",
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme",
)
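# Note: mcp_server=True requires MCP support in Gradio; per the Gradio docs
# this means a recent release installed with the MCP extra, e.g.
#   pip install "gradio[mcp]"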
if __name__ == "__main__":
    # Expose as an MCP server so you can chain it with other Spaces
    demo.launch(mcp_server=True)