# File: main/app.py
# Purpose: Fetch only the "relevant" page content (title, key metadata, clean body text, and hyperlinks)
# instead of returning full HTML. Output is compact and configurable to reduce verbosity.
import gradio as gr # UI library
import requests # HTTP client
from bs4 import BeautifulSoup # HTML parsing
from readability import Document # Readability algorithm to isolate main content
from urllib.parse import urljoin, urldefrag, urlparse # URL helpers
import re # For whitespace cleanup and simple formatting
# -------------------------------
# HTTP fetching with sane defaults
# -------------------------------
def _http_get(url: str) -> requests.Response:
"""
Make an HTTP GET request with headers and a timeout.
Layman's terms: downloads the webpage safely and politely.
"""
headers = {
"User-Agent": "Mozilla/5.0 (compatible; NymboFetcher/1.0; +https://example.com)",
"Accept-Language": "en-US,en;q=0.9",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
# Short timeouts so the app isn't stuck forever
return requests.get(url, headers=headers, timeout=15)
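
# Illustrative usage (a sketch, not executed at import time):
#   resp = _http_get("https://example.com")
#   resp.status_code -> 200 on success; resp.url holds the final URL after any redirects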
# ----------------------------------------
# Helpers: text cleanup & friendly trimming
# ----------------------------------------
def _normalize_whitespace(text: str) -> str:
"""
Layman's terms: squash weird spacing and too many blank lines.
"""
text = re.sub(r"[ \t\u00A0]+", " ", text) # collapse runs of spaces
text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip()) # max 1 blank line at a time
return text.strip()
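
# Illustrative behavior (a sketch of the expected output):
#   _normalize_whitespace("a   b\t c\n\n\n\nd")  ->  "a b c\n\nd"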
def _truncate(text: str, max_chars: int) -> tuple[str, bool]:
"""
Layman's terms: cut the text if it’s too long and tell the caller if we cut it.
"""
if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
return text, False
return text[:max_chars].rstrip() + " …", True
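
# Illustrative behavior (a sketch of the expected output):
#   _truncate("abcdefghij", 4)  ->  ("abcd …", True)
#   _truncate("abc", 10)        ->  ("abc", False)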
def _domain_of(url: str) -> str:
"""
Layman's terms: show a friendly domain like example.com.
"""
try:
return urlparse(url).netloc or ""
except Exception:
return ""
# -----------------------------------
# Metadata extraction (title, etc.)
# -----------------------------------
def _extract_metadata(soup: BeautifulSoup, final_url: str) -> dict:
"""
Layman's terms: grab useful fields like title, description, site name, and canonical link.
"""
meta = {}
# Title preference: <title> > og:title > twitter:title
title_candidates = [
(soup.title.string if soup.title and soup.title.string else None),
_og(soup, "og:title"),
_meta(soup, "twitter:title"),
]
meta["title"] = next((t.strip() for t in title_candidates if t and t.strip()), "")
# Description preference: meta[name=description] > og:description > twitter:description
desc_candidates = [
_meta(soup, "description"),
_og(soup, "og:description"),
_meta(soup, "twitter:description"),
]
meta["description"] = next((d.strip() for d in desc_candidates if d and d.strip()), "")
# Canonical URL if provided (helps dedupe / standardize)
link_canonical = soup.find("link", rel=lambda v: v and "canonical" in v)
meta["canonical"] = (link_canonical.get("href") or "").strip() if link_canonical else ""
# Site name (nice for context)
meta["site_name"] = (_og(soup, "og:site_name") or "").strip()
# Language (if present)
html_tag = soup.find("html")
meta["lang"] = (html_tag.get("lang") or "").strip() if html_tag else ""
# Final resolved URL and domain
meta["fetched_url"] = final_url
meta["domain"] = _domain_of(final_url)
return meta
def _meta(soup: BeautifulSoup, name: str) -> str | None:
tag = soup.find("meta", attrs={"name": name})
return tag.get("content") if tag and tag.has_attr("content") else None
def _og(soup: BeautifulSoup, prop: str) -> str | None:
tag = soup.find("meta", attrs={"property": prop})
return tag.get("content") if tag and tag.has_attr("content") else None
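
# Illustrative shape of the metadata dict (a sketch; values depend entirely on the page):
#   {"title": "Example Article", "description": "…", "canonical": "https://example.com/article",
#    "site_name": "Example", "lang": "en",
#    "fetched_url": "https://example.com/article", "domain": "example.com"}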
# ---------------------------------------------------------
# Main content extraction with Readability + gentle cleanup
# ---------------------------------------------------------
def _extract_main_text(html: str) -> tuple[str, BeautifulSoup]:
"""
Layman's terms: use Readability to find the article body, then clean it to plain text.
Returns (clean_text, soup_of_readable_html) for link scraping.
"""
# Readability gives us a simplified article HTML
doc = Document(html)
readable_html = doc.summary(html_partial=True)
# Parse the simplified HTML so we can clean it up further
s = BeautifulSoup(readable_html, "lxml")
# Remove obviously noisy elements if present
for sel in ["script", "style", "noscript", "iframe", "svg"]:
for tag in s.select(sel):
tag.decompose()
# Extract text with paragraphs preserved, then normalize whitespace
text_parts = []
for p in s.find_all(["p", "li", "h2", "h3", "h4", "blockquote"]):
        # Keep list items and headings to retain structure without being too verbose
chunk = p.get_text(" ", strip=True)
if chunk:
text_parts.append(chunk)
clean_text = _normalize_whitespace("\n\n".join(text_parts))
return clean_text, s
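
# Illustrative usage (a sketch; the exact text depends on Readability's scoring of the page):
#   text, simplified = _extract_main_text(html)
#   `text` is paragraphs/headings joined by blank lines; `simplified` is the BeautifulSoup
#   of the Readability-extracted HTML, reused below so links come from the article only.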
# ------------------------------------------
# Link extraction from the simplified content
# ------------------------------------------
def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> list[tuple[str, str]]:
"""
Layman's terms: pull out clickable links from the article content only,
turn them into absolute URLs, drop junk, dedupe, and cap the list.
"""
    if max_links <= 0:
        # A non-positive cap means "no links" (also skips wasted work when the caller disables links)
        return []
    seen = set()
links: list[tuple[str, str]] = []
for a in readable_soup.find_all("a", href=True):
href = a.get("href").strip()
# Ignore anchors, mailto, javascript, and empty
if not href or href.startswith("#") or href.startswith("mailto:") or href.startswith("javascript:"):
continue
# Resolve relative URLs and strip URL fragments (#section)
absolute = urljoin(base_url, href)
absolute, _ = urldefrag(absolute)
if absolute in seen:
continue
seen.add(absolute)
text = a.get_text(" ", strip=True)
# Keep link text concise
if len(text) > 120:
text = text[:117] + "…"
links.append((text or absolute, absolute))
if len(links) >= max_links > 0:
break
return links
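
# Illustrative result (a sketch): for an article at https://example.com/post containing
# <a href="/about">About us</a>, the returned list would include
#   ("About us", "https://example.com/about")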
# -------------------------
# Formatter: compact output
# -------------------------
def _format_markdown(meta: dict, body: str, body_truncated: bool, links: list[tuple[str, str]],
include_text: bool, include_metadata: bool, include_links: bool, verbosity: str) -> str:
"""
Layman's terms: turn the pieces into a neat, compact Markdown string.
"""
lines = []
# Title header
title = meta.get("title") or meta.get("domain") or "Untitled"
lines.append(f"# {title}")
# Metadata (compact)
if include_metadata:
md = []
# Only show fields that exist to keep things tight
if meta.get("description"):
md.append(f"- **Description:** {meta['description']}")
if meta.get("site_name"):
md.append(f"- **Site:** {meta['site_name']}")
if meta.get("canonical"):
md.append(f"- **Canonical:** {meta['canonical']}")
if meta.get("lang"):
md.append(f"- **Language:** {meta['lang']}")
if meta.get("fetched_url"):
md.append(f"- **Fetched From:** {meta['fetched_url']}")
if md:
lines.append("## Metadata")
lines.extend(md)
# Body text
if include_text and body:
# For "Brief", show a very small excerpt even after truncation
if verbosity == "Brief":
brief, was_more = _truncate(body, 800)
lines.append("## Text")
lines.append(brief)
if was_more or body_truncated:
lines.append("\n> (Trimmed for brevity)")
else:
lines.append("## Text")
lines.append(body)
if body_truncated:
lines.append("\n> (Trimmed for brevity)")
# Links
if include_links and links:
lines.append(f"## Links ({len(links)})")
for text, url in links:
lines.append(f"- [{text}]({url})")
return "\n\n".join(lines).strip()
# --------------------------------
# Gradio-facing function (the app)
# --------------------------------
def extract_relevant(
url: str,
verbosity: str = "Standard",
include_metadata: bool = True,
include_text: bool = True,
include_links: bool = True,
max_chars: int = 3000,
max_links: int = 20
) -> str:
"""
Given a URL, fetch the page, extract just the good stuff, and return a compact Markdown summary.
"""
if not url or not url.strip():
return "Please enter a valid URL."
try:
resp = _http_get(url)
resp.raise_for_status()
except requests.exceptions.RequestException as e:
return f"An error occurred: {e}"
# Respect the final resolved URL (after redirects)
final_url = str(resp.url)
# Only process HTML-ish responses
ctype = resp.headers.get("Content-Type", "")
if "html" not in ctype.lower():
return f"Unsupported content type for extraction: {ctype or 'unknown'}"
# Decode as text (requests usually sets encoding; otherwise guess)
resp.encoding = resp.encoding or resp.apparent_encoding
html = resp.text
# Full page soup (to extract metadata accurately)
full_soup = BeautifulSoup(html, "lxml")
meta = _extract_metadata(full_soup, final_url)
# Extract main body text using Readability
body_text, readable_soup = _extract_main_text(html)
# If the body is suspiciously empty, fall back to a simpler text strategy
if not body_text:
fallback_text = full_soup.get_text(" ", strip=True)
body_text = _normalize_whitespace(fallback_text)
    # Verbosity presets set an upper bound on body length; the slider can tighten it but not exceed it
preset_caps = {"Brief": 1200, "Standard": 3000, "Full": 999999}
target_cap = preset_caps.get(verbosity, 3000)
# Use the *smaller* of user cap and preset to keep things tidy
cap = min(max_chars if max_chars > 0 else target_cap, target_cap)
body_text, truncated = _truncate(body_text, cap) if include_text else ("", False)
# Extract links from the readable portion only (cleaner than whole DOM)
links = _extract_links(readable_soup, final_url, max_links=max_links if include_links else 0)
# Build compact Markdown
md = _format_markdown(
meta=meta,
body=body_text,
body_truncated=truncated,
links=links,
include_text=include_text,
include_metadata=include_metadata,
include_links=include_links,
verbosity=verbosity
)
return md or "No content could be extracted."
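
# Illustrative direct call (a sketch; requires network access, not executed at import time):
#   print(extract_relevant("https://example.com/article", verbosity="Brief",
#                          max_chars=1200, max_links=10))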
# -----------------
# Gradio UI (Blocks)
# -----------------
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
# Title & subtitle for clarity
gr.Markdown("# Fetch MCP — Clean Extract")
gr.Markdown(
"Extract **title**, **metadata**, **clean text**, and **links** — without the noisy HTML. "
"Use Verbosity and caps to keep it tight."
)
with gr.Row():
url_in = gr.Textbox(label="URL", placeholder="https://example.com/article")
fetch_btn = gr.Button("Fetch Clean Content")
with gr.Accordion("Options", open=False):
with gr.Row():
verbosity = gr.Dropdown(
label="Verbosity",
choices=["Brief", "Standard", "Full"],
value="Standard",
info="Controls how much text you get back."
)
max_chars = gr.Slider(
400, 12000, value=3000, step=100,
label="Max Characters (body text)",
info="Hard cap for body text. Lower = less verbose."
)
max_links = gr.Slider(
0, 100, value=20, step=1,
label="Max Links",
info="Limit how many hyperlinks we include."
)
with gr.Row():
include_metadata = gr.Checkbox(value=True, label="Include Metadata")
include_text = gr.Checkbox(value=True, label="Include Main Text")
include_links = gr.Checkbox(value=True, label="Include Links")
# Output as Markdown (compact and readable)
out = gr.Markdown(label="Result")
# Wire up the click
fetch_btn.click(
fn=extract_relevant,
inputs=[url_in, verbosity, include_metadata, include_text, include_links, max_chars, max_links],
outputs=out
)
# Keep MCP server enabled
if __name__ == "__main__":
demo.launch(mcp_server=True)
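# Illustrative client call once the app is running (a sketch; the local URL and the
# auto-generated api_name are assumptions based on Gradio's defaults):
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860/")
#   md = client.predict("https://example.com/article", "Standard", True, True, True,
#                       3000, 20, api_name="/extract_relevant")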