# app.py
# Hugging Face Space: Cleaner web-page fetcher
# -------------------------------------------------------------
# Fetches a URL and returns a concise, human-readable snapshot:
# • Title
# • Meta description
# • Main text (readability-extracted)
# • Hyperlinks (anchor text → absolute URL)
# -------------------------------------------------------------
import requests # HTTP client
from bs4 import BeautifulSoup # HTML parsing
from readability import Document # Boiler-plate removal
from urllib.parse import urljoin # Build absolute link URLs
import gradio as gr # UI framework


def extract_relevant_text(html: str, base_url: str) -> str:
    """
    Convert raw HTML into a clean, plain-text summary.
    - html: the page's HTML source
    - base_url: needed for resolving relative <a href="">
    Returns a formatted string ready for display.
    """
    # 1) Let readability isolate the primary article/content
    doc = Document(html)
    title = doc.short_title()
    summary_html = doc.summary()  # cleaned, minimal HTML
    summary_soup = BeautifulSoup(summary_html, "lxml")
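    # (the "lxml" parser needs the lxml package; readability-lxml already
    #  depends on it, so no extra requirement here)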

    # 2) Grab visible paragraph & list text
    body_parts = [
        tag.get_text(" ", strip=True)
        for tag in summary_soup.find_all(["p", "li"])
        if tag.get_text(strip=True)
    ]
    main_text = "\n\n".join(body_parts) or "[No main text extracted]"

    # 3) Extract meta description from the *full* document
    full_soup = BeautifulSoup(html, "lxml")
    meta_desc = ""
meta_tag = full_soup.find("meta", attrs={"name": "description"})
if meta_tag and meta_tag.get("content"):
meta_desc = meta_tag["content"].strip()
else: # Fallback to Open Graph description
og_tag = full_soup.find("meta", attrs={"property": "og:description"})
if og_tag and og_tag.get("content"):
meta_desc = og_tag["content"].strip()

    # 4) Build a neat list of hyperlinks (anchor text → absolute URL)
    links = []
    for a in summary_soup.find_all("a", href=True):
        href_abs = urljoin(base_url, a["href"])
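        # e.g. urljoin("https://example.com/docs/", "../about")
        # resolves to "https://example.com/about"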
text = a.get_text(" ", strip=True) or "[link]"
links.append(f"β’ {text} β {href_abs}")

    # 5) Compose the final plaintext output
    sections = [
        f"Title: {title}",
        f"Description: {meta_desc or '[None]'}",
        f"Body:\n{main_text}",
        "Links:\n" + ("\n".join(links) if links else "[No links]"),
    ]
    return "\n\n".join(sections)


def fetch_content(url: str) -> str:
    """
    Fetch the URL and return a concise summary.
    Includes basic error handling for network issues.
    """
    try:
        # Friendly user-agent prevents some 403s
        headers = {"User-Agent": "Mozilla/5.0 (compatible; CleanFetcher/1.0)"}
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()  # 4xx/5xx → exception
        return extract_relevant_text(response.text, url)
    except requests.exceptions.RequestException as err:
        # Any network or HTTP error bubbles up here
        return f"[Error] {err}"
# -------------------------- Gradio UI --------------------------
demo = gr.Interface(
    fn=fetch_content,
    inputs=gr.Textbox(label="URL", placeholder="https://example.com"),
    outputs=gr.Textbox(
        label="Clean Page Snapshot",
        interactive=False,
        lines=25,  # taller box for readability
    ),
    title="Clean Web Snapshot",
    description="Enter a URL to retrieve a tidy text summary (title, description, main content, and links).",
    flagging_mode="never",  # Gradio 5 name for the old allow_flagging
    theme="Nymbo/Nymbo_Theme",
)
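
# Note: the theme string loads "Nymbo/Nymbo_Theme" from the Hugging Face Hub
# at launch; any built-in theme object (e.g. gr.themes.Soft()) works offline.
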
if __name__ == "__main__":
    # Expose as an MCP server so you can chain it with other Spaces
    demo.launch(mcp_server=True)