import re
from urllib.parse import urljoin, urldefrag, urlparse

import gradio as gr
import requests
from bs4 import BeautifulSoup
from readability import Document

def _http_get(url: str) -> requests.Response:
    """
    Make an HTTP GET request with headers and a timeout.
    Layman's terms: downloads the webpage safely and politely.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; NymboFetcher/1.0; +https://example.com)",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    return requests.get(url, headers=headers, timeout=15)

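# Illustrative usage of the helper above (a sketch, not executed at import time; example.com is a
# hypothetical target):
#
#   resp = _http_get("https://example.com")
#   resp.raise_for_status()
#   html = resp.text
#
# requests follows redirects by default, so resp.url reflects the final address after any hops.
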
def _normalize_whitespace(text: str) -> str:
    """
    Layman's terms: squash weird spacing and too many blank lines.
    """
    text = re.sub(r"[ \t\u00A0]+", " ", text)
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())
    return text.strip()

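# Rough before/after sketch of the normalizer (illustrative strings, not from a real page):
#
#   _normalize_whitespace("Intro   text\n\n\n\nNext paragraph")
#   -> "Intro text\n\nNext paragraph"
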
def _truncate(text: str, max_chars: int) -> tuple[str, bool]:
    """
    Layman's terms: cut the text if it’s too long and tell the caller if we cut it.
    """
    if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
        return text, False
    return text[:max_chars].rstrip() + " …", True

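# Quick examples of the truncation contract (illustrative values):
#
#   _truncate("short text", 100)  -> ("short text", False)   # under the cap, untouched
#   _truncate("abcdefghij", 4)    -> ("abcd …", True)        # trimmed, ellipsis appended
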
def _domain_of(url: str) -> str:
    """
    Layman's terms: show a friendly domain like example.com.
    """
    try:
        return urlparse(url).netloc or ""
    except Exception:
        return ""

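# For example (illustrative URL): _domain_of("https://example.com/some/article") -> "example.com".
# Note that urlparse keeps any explicit port, so "https://example.com:8080/x" -> "example.com:8080".
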
def _extract_metadata(soup: BeautifulSoup, final_url: str) -> dict:
    """
    Layman's terms: grab useful fields like title, description, site name, and canonical link.
    """
    meta = {}

    title_candidates = [
        (soup.title.string if soup.title and soup.title.string else None),
        _og(soup, "og:title"),
        _meta(soup, "twitter:title"),
    ]
    meta["title"] = next((t.strip() for t in title_candidates if t and t.strip()), "")

    desc_candidates = [
        _meta(soup, "description"),
        _og(soup, "og:description"),
        _meta(soup, "twitter:description"),
    ]
    meta["description"] = next((d.strip() for d in desc_candidates if d and d.strip()), "")

    link_canonical = soup.find("link", rel=lambda v: v and "canonical" in v)
    meta["canonical"] = (link_canonical.get("href") or "").strip() if link_canonical else ""

    meta["site_name"] = (_og(soup, "og:site_name") or "").strip()

    html_tag = soup.find("html")
    meta["lang"] = (html_tag.get("lang") or "").strip() if html_tag else ""

    meta["fetched_url"] = final_url
    meta["domain"] = _domain_of(final_url)

    return meta

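# The dict returned above always carries the same keys; a sketch of its shape, with illustrative
# values for a hypothetical page:
#
#   {
#       "title": "Example Article",
#       "description": "A one-line summary from the meta tags.",
#       "canonical": "https://example.com/article",
#       "site_name": "Example",
#       "lang": "en",
#       "fetched_url": "https://example.com/article",
#       "domain": "example.com",
#   }
#
# Missing fields come back as empty strings rather than being dropped.
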
def _meta(soup: BeautifulSoup, name: str) -> str | None:
    """
    Layman's terms: read the content of a <meta name="..."> tag, if present.
    """
    tag = soup.find("meta", attrs={"name": name})
    return tag.get("content") if tag and tag.has_attr("content") else None


def _og(soup: BeautifulSoup, prop: str) -> str | None:
    """
    Layman's terms: read the content of a <meta property="..."> (Open Graph style) tag, if present.
    """
    tag = soup.find("meta", attrs={"property": prop})
    return tag.get("content") if tag and tag.has_attr("content") else None

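# For instance, given markup like
#   <meta name="description" content="Hello">
#   <meta property="og:title" content="My Page">
# _meta(soup, "description") returns "Hello" and _og(soup, "og:title") returns "My Page";
# both return None when the tag, or its content attribute, is missing.
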
def _extract_main_text(html: str) -> tuple[str, BeautifulSoup]:
    """
    Layman's terms: use Readability to find the article body, then clean it to plain text.
    Returns (clean_text, soup_of_readable_html) for link scraping.
    """
    doc = Document(html)
    readable_html = doc.summary(html_partial=True)

    s = BeautifulSoup(readable_html, "lxml")

    for sel in ["script", "style", "noscript", "iframe", "svg"]:
        for tag in s.select(sel):
            tag.decompose()

    text_parts = []
    for p in s.find_all(["p", "li", "h2", "h3", "h4", "blockquote"]):
        chunk = p.get_text(" ", strip=True)
        if chunk:
            text_parts.append(chunk)

    clean_text = _normalize_whitespace("\n\n".join(text_parts))
    return clean_text, s

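# Rough sketch of the output for a simple article body (illustrative markup; Readability's exact
# pruning varies from page to page):
#
#   <p>First paragraph.</p><p>Second paragraph.</p>
#   -> clean_text == "First paragraph.\n\nSecond paragraph."
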
def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> list[tuple[str, str]]:
    """
    Layman's terms: pull out clickable links from the article content only,
    turn them into absolute URLs, drop junk, dedupe, and cap the list.
    """
    if max_links <= 0:
        # A cap of zero (or less) means "no links at all" rather than "unlimited".
        return []

    seen = set()
    links: list[tuple[str, str]] = []

    for a in readable_soup.find_all("a", href=True):
        href = a.get("href").strip()

        # Skip in-page anchors, mail links, and javascript pseudo-links.
        if not href or href.startswith("#") or href.startswith("mailto:") or href.startswith("javascript:"):
            continue

        absolute = urljoin(base_url, href)
        absolute, _ = urldefrag(absolute)

        if absolute in seen:
            continue
        seen.add(absolute)

        text = a.get_text(" ", strip=True)
        if len(text) > 120:
            text = text[:117] + "…"

        links.append((text or absolute, absolute))

        if len(links) >= max_links:
            break

    return links

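# Example of how a single anchor is handled (illustrative values):
#
#   base_url = "https://example.com/posts/42"
#   <a href="/about#team">About us</a>
#   -> urljoin gives "https://example.com/about#team", urldefrag strips the fragment,
#      and the stored pair is ("About us", "https://example.com/about").
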
def _format_markdown(meta: dict, body: str, body_truncated: bool, links: list[tuple[str, str]],
                     include_text: bool, include_metadata: bool, include_links: bool, verbosity: str) -> str:
    """
    Layman's terms: turn the pieces into a neat, compact Markdown string.
    """
    lines = []

    title = meta.get("title") or meta.get("domain") or "Untitled"
    lines.append(f"# {title}")

    if include_metadata:
        md = []
        if meta.get("description"):
            md.append(f"- **Description:** {meta['description']}")
        if meta.get("site_name"):
            md.append(f"- **Site:** {meta['site_name']}")
        if meta.get("canonical"):
            md.append(f"- **Canonical:** {meta['canonical']}")
        if meta.get("lang"):
            md.append(f"- **Language:** {meta['lang']}")
        if meta.get("fetched_url"):
            md.append(f"- **Fetched From:** {meta['fetched_url']}")

        if md:
            lines.append("## Metadata")
            lines.extend(md)

    if include_text and body:
        if verbosity == "Brief":
            brief, was_more = _truncate(body, 800)
            lines.append("## Text")
            lines.append(brief)
            if was_more or body_truncated:
                lines.append("\n> (Trimmed for brevity)")
        else:
            lines.append("## Text")
            lines.append(body)
            if body_truncated:
                lines.append("\n> (Trimmed for brevity)")

    if include_links and links:
        lines.append(f"## Links ({len(links)})")
        for text, url in links:
            lines.append(f"- [{text}]({url})")

    return "\n\n".join(lines).strip()

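# Shape of the Markdown this produces (sections appear only when enabled and non-empty):
#
#   # <title or domain>
#
#   ## Metadata
#   - **Description:** ...
#
#   ## Text
#   <clean body text>
#
#   ## Links (n)
#   - [link text](https://example.com/...)
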
def extract_relevant(
    url: str,
    verbosity: str = "Standard",
    include_metadata: bool = True,
    include_text: bool = True,
    include_links: bool = True,
    max_chars: int = 3000,
    max_links: int = 20
) -> str:
    """
    Given a URL, fetch the page, extract just the good stuff, and return a compact Markdown summary.
    """
    if not url or not url.strip():
        return "Please enter a valid URL."

    # Gradio sliders may deliver floats; coerce to int so slicing and capping behave.
    max_chars = int(max_chars)
    max_links = int(max_links)

    try:
        resp = _http_get(url)
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"

    final_url = str(resp.url)

    ctype = resp.headers.get("Content-Type", "")
    if "html" not in ctype.lower():
        return f"Unsupported content type for extraction: {ctype or 'unknown'}"

    resp.encoding = resp.encoding or resp.apparent_encoding
    html = resp.text

    full_soup = BeautifulSoup(html, "lxml")
    meta = _extract_metadata(full_soup, final_url)

    body_text, readable_soup = _extract_main_text(html)

    # Fall back to the whole page's text if Readability found no article body.
    if not body_text:
        fallback_text = full_soup.get_text(" ", strip=True)
        body_text = _normalize_whitespace(fallback_text)

    # Each verbosity preset has its own ceiling; the slider can only tighten it further.
    preset_caps = {"Brief": 1200, "Standard": 3000, "Full": 999999}
    target_cap = preset_caps.get(verbosity, 3000)

    cap = min(max_chars if max_chars > 0 else target_cap, target_cap)
    body_text, truncated = _truncate(body_text, cap) if include_text else ("", False)

    links = _extract_links(readable_soup, final_url, max_links=max_links if include_links else 0)

    md = _format_markdown(
        meta=meta,
        body=body_text,
        body_truncated=truncated,
        links=links,
        include_text=include_text,
        include_metadata=include_metadata,
        include_links=include_links,
        verbosity=verbosity
    )
    return md or "No content could be extracted."

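# A direct call, bypassing the UI, would look roughly like this (requires network access, and the
# exact output naturally depends on the page fetched):
#
#   md = extract_relevant("https://example.com", verbosity="Brief", max_links=5)
#   print(md)
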
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    gr.Markdown("# Fetch MCP — Clean Extract")
    gr.Markdown(
        "Extract **title**, **metadata**, **clean text**, and **links** — without the noisy HTML. "
        "Use Verbosity and caps to keep it tight."
    )

    with gr.Row():
        url_in = gr.Textbox(label="URL", placeholder="https://example.com/article")
        fetch_btn = gr.Button("Fetch Clean Content")

    with gr.Accordion("Options", open=False):
        with gr.Row():
            verbosity = gr.Dropdown(
                label="Verbosity",
                choices=["Brief", "Standard", "Full"],
                value="Standard",
                info="Controls how much text you get back."
            )
            max_chars = gr.Slider(
                400, 12000, value=3000, step=100,
                label="Max Characters (body text)",
                info="Hard cap for body text. Lower = less verbose."
            )
            max_links = gr.Slider(
                0, 100, value=20, step=1,
                label="Max Links",
                info="Limit how many hyperlinks we include."
            )
        with gr.Row():
            include_metadata = gr.Checkbox(value=True, label="Include Metadata")
            include_text = gr.Checkbox(value=True, label="Include Main Text")
            include_links = gr.Checkbox(value=True, label="Include Links")

    out = gr.Markdown(label="Result")

    fetch_btn.click(
        fn=extract_relevant,
        inputs=[url_in, verbosity, include_metadata, include_text, include_links, max_chars, max_links],
        outputs=out
    )

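# Launching with mcp_server=True is intended to expose the function wired to the button
# (extract_relevant) as an MCP tool in addition to the regular web UI. The exact endpoint and
# client setup depend on the installed Gradio version, so check the console output from launch().
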
if __name__ == "__main__":
    demo.launch(mcp_server=True)