# File: main/app.py
# Purpose: Fetch only the "relevant" page content (title, key metadata, clean body text, and hyperlinks)
# instead of returning full HTML. Output is compact and configurable to reduce verbosity.
import gradio as gr # UI library
import requests # HTTP client
from bs4 import BeautifulSoup # HTML parsing
from readability import Document # Readability algorithm to isolate main content
from urllib.parse import urljoin, urldefrag, urlparse # URL helpers
import re # For whitespace cleanup and simple formatting
# -------------------------------
# HTTP fetching with sane defaults
# -------------------------------
def _http_get(url: str) -> requests.Response:
    """
    Make an HTTP GET request with headers and a timeout.
    Layman's terms: downloads the webpage safely and politely.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; NymboFetcher/1.0; +https://example.com)",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    # Short timeout so the app isn't stuck forever
    return requests.get(url, headers=headers, timeout=15)
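# Illustrative usage (hypothetical URL, not executed at import time):
#   resp = _http_get("https://example.com/article")
#   resp.raise_for_status()  # status checking is left to the caller, see extract_relevant()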
# ----------------------------------------
# Helpers: text cleanup & friendly trimming
# ----------------------------------------
def _normalize_whitespace(text: str) -> str:
    """
    Layman's terms: squash weird spacing and too many blank lines.
    """
    text = re.sub(r"[ \t\u00A0]+", " ", text)              # collapse runs of spaces, tabs, and non-breaking spaces
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())  # allow at most one blank line in a row
    return text.strip()
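# Illustrative behaviour (made-up input, shown for clarity only):
#   _normalize_whitespace("a \t b\n\n\n\nc")  ->  "a b\n\nc"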
def _truncate(text: str, max_chars: int) -> tuple[str, bool]:
    """
    Layman's terms: cut the text if it’s too long and tell the caller if we cut it.
    """
    if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
        return text, False
    return text[:max_chars].rstrip() + " …", True
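# Illustrative behaviour (made-up strings):
#   _truncate("hello world", 5)  ->  ("hello …", True)
#   _truncate("short", 100)      ->  ("short", False)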
def _domain_of(url: str) -> str:
    """
    Layman's terms: show a friendly domain like example.com.
    """
    try:
        return urlparse(url).netloc or ""
    except Exception:
        return ""
# -----------------------------------
# Metadata extraction (title, etc.)
# -----------------------------------
def _extract_metadata(soup: BeautifulSoup, final_url: str) -> dict:
    """
    Layman's terms: grab useful fields like title, description, site name, and canonical link.
    """
    meta = {}

    # Title preference: <title> > og:title > twitter:title
    title_candidates = [
        (soup.title.string if soup.title and soup.title.string else None),
        _og(soup, "og:title"),
        _meta(soup, "twitter:title"),
    ]
    meta["title"] = next((t.strip() for t in title_candidates if t and t.strip()), "")

    # Description preference: meta[name=description] > og:description > twitter:description
    desc_candidates = [
        _meta(soup, "description"),
        _og(soup, "og:description"),
        _meta(soup, "twitter:description"),
    ]
    meta["description"] = next((d.strip() for d in desc_candidates if d and d.strip()), "")

    # Canonical URL if provided (helps dedupe / standardize)
    link_canonical = soup.find("link", rel=lambda v: v and "canonical" in v)
    meta["canonical"] = (link_canonical.get("href") or "").strip() if link_canonical else ""

    # Site name (nice for context)
    meta["site_name"] = (_og(soup, "og:site_name") or "").strip()

    # Language (if present)
    html_tag = soup.find("html")
    meta["lang"] = (html_tag.get("lang") or "").strip() if html_tag else ""

    # Final resolved URL and domain
    meta["fetched_url"] = final_url
    meta["domain"] = _domain_of(final_url)
    return meta
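# Shape of the returned dict (values here are placeholders, missing fields come back as ""):
#   {"title": "...", "description": "...", "canonical": "...", "site_name": "...",
#    "lang": "en", "fetched_url": "https://example.com/article", "domain": "example.com"}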
def _meta(soup: BeautifulSoup, name: str) -> str | None:
    tag = soup.find("meta", attrs={"name": name})
    return tag.get("content") if tag and tag.has_attr("content") else None


def _og(soup: BeautifulSoup, prop: str) -> str | None:
    tag = soup.find("meta", attrs={"property": prop})
    return tag.get("content") if tag and tag.has_attr("content") else None
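# Illustrative lookups (assuming the page contains the corresponding tags):
#   _meta(soup, "description")  reads  <meta name="description" content="...">
#   _og(soup, "og:title")       reads  <meta property="og:title" content="...">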
# ---------------------------------------------------------
# Main content extraction with Readability + gentle cleanup
# ---------------------------------------------------------
def _extract_main_text(html: str) -> tuple[str, BeautifulSoup]:
    """
    Layman's terms: use Readability to find the article body, then clean it to plain text.
    Returns (clean_text, soup_of_readable_html) for link scraping.
    """
    # Readability gives us a simplified article HTML
    doc = Document(html)
    readable_html = doc.summary(html_partial=True)

    # Parse the simplified HTML so we can clean it up further
    s = BeautifulSoup(readable_html, "lxml")

    # Remove obviously noisy elements if present
    for sel in ["script", "style", "noscript", "iframe", "svg"]:
        for tag in s.select(sel):
            tag.decompose()

    # Extract text with paragraphs preserved, then normalize whitespace
    text_parts = []
    for p in s.find_all(["p", "li", "h2", "h3", "h4", "blockquote"]):
        # Keep list items and headings to retain structure without being too verbose
        chunk = p.get_text(" ", strip=True)
        if chunk:
            text_parts.append(chunk)
    clean_text = _normalize_whitespace("\n\n".join(text_parts))
    return clean_text, s
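# Rough shape of the result on a simple article page (hypothetical HTML; the exact output
# depends on Readability's heuristics):
#   _extract_main_text("<html>...<p>First para</p><p>Second para</p>...</html>")
#   ->  ("First para\n\nSecond para", <BeautifulSoup of the simplified article HTML>)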
# ------------------------------------------
# Link extraction from the simplified content
# ------------------------------------------
def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> list[tuple[str, str]]:
    """
    Layman's terms: pull out clickable links from the article content only,
    turn them into absolute URLs, drop junk, dedupe, and cap the list.
    """
    # A non-positive cap means "no links at all" (e.g. the caller disabled links).
    if not max_links or max_links <= 0:
        return []

    seen = set()
    links: list[tuple[str, str]] = []
    for a in readable_soup.find_all("a", href=True):
        href = a.get("href").strip()

        # Ignore in-page anchors, mailto:, javascript:, and empty hrefs
        if not href or href.startswith(("#", "mailto:", "javascript:")):
            continue

        # Resolve relative URLs and strip URL fragments (#section)
        absolute = urljoin(base_url, href)
        absolute, _ = urldefrag(absolute)
        if absolute in seen:
            continue
        seen.add(absolute)

        text = a.get_text(" ", strip=True)
        # Keep link text concise
        if len(text) > 120:
            text = text[:117] + "…"
        links.append((text or absolute, absolute))

        if len(links) >= max_links:
            break
    return links
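# Illustrative resolution (hypothetical values):
#   base_url = "https://example.com/posts/42"
#   <a href="/about#team">About us</a>  ->  ("About us", "https://example.com/about")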
# -------------------------
# Formatter: compact output
# -------------------------
def _format_markdown(meta: dict, body: str, body_truncated: bool, links: list[tuple[str, str]],
                     include_text: bool, include_metadata: bool, include_links: bool, verbosity: str) -> str:
    """
    Layman's terms: turn the pieces into a neat, compact Markdown string.
    """
    lines = []

    # Title header
    title = meta.get("title") or meta.get("domain") or "Untitled"
    lines.append(f"# {title}")

    # Metadata (compact)
    if include_metadata:
        md = []
        # Only show fields that exist to keep things tight
        if meta.get("description"):
            md.append(f"- **Description:** {meta['description']}")
        if meta.get("site_name"):
            md.append(f"- **Site:** {meta['site_name']}")
        if meta.get("canonical"):
            md.append(f"- **Canonical:** {meta['canonical']}")
        if meta.get("lang"):
            md.append(f"- **Language:** {meta['lang']}")
        if meta.get("fetched_url"):
            md.append(f"- **Fetched From:** {meta['fetched_url']}")
        if md:
            lines.append("## Metadata")
            lines.extend(md)

    # Body text
    if include_text and body:
        # For "Brief", show a very small excerpt even after truncation
        if verbosity == "Brief":
            brief, was_more = _truncate(body, 800)
            lines.append("## Text")
            lines.append(brief)
            if was_more or body_truncated:
                lines.append("\n> (Trimmed for brevity)")
        else:
            lines.append("## Text")
            lines.append(body)
            if body_truncated:
                lines.append("\n> (Trimmed for brevity)")

    # Links
    if include_links and links:
        lines.append(f"## Links ({len(links)})")
        for text, url in links:
            lines.append(f"- [{text}]({url})")

    return "\n\n".join(lines).strip()
# --------------------------------
# Gradio-facing function (the app)
# --------------------------------
def extract_relevant(
    url: str,
    verbosity: str = "Standard",
    include_metadata: bool = True,
    include_text: bool = True,
    include_links: bool = True,
    max_chars: int = 3000,
    max_links: int = 20
) -> str:
    """
    Given a URL, fetch the page, extract just the good stuff, and return a compact Markdown summary.
    """
    if not url or not url.strip():
        return "Please enter a valid URL."

    try:
        resp = _http_get(url)
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"

    # Respect the final resolved URL (after redirects)
    final_url = str(resp.url)

    # Only process HTML-ish responses
    ctype = resp.headers.get("Content-Type", "")
    if "html" not in ctype.lower():
        return f"Unsupported content type for extraction: {ctype or 'unknown'}"

    # Decode as text (requests usually sets the encoding; otherwise fall back to its guess)
    resp.encoding = resp.encoding or resp.apparent_encoding
    html = resp.text

    # Full-page soup (to extract metadata accurately)
    full_soup = BeautifulSoup(html, "lxml")
    meta = _extract_metadata(full_soup, final_url)

    # Extract the main body text using Readability
    body_text, readable_soup = _extract_main_text(html)

    # If the body is suspiciously empty, fall back to a simpler text strategy
    if not body_text:
        fallback_text = full_soup.get_text(" ", strip=True)
        body_text = _normalize_whitespace(fallback_text)

    # Apply verbosity presets; the slider can only tighten the cap further
    preset_caps = {"Brief": 1200, "Standard": 3000, "Full": 999999}
    target_cap = preset_caps.get(verbosity, 3000)
    # Use the *smaller* of the user cap and the preset to keep things tidy
    cap = min(max_chars if max_chars > 0 else target_cap, target_cap)
    body_text, truncated = _truncate(body_text, cap) if include_text else ("", False)

    # Extract links from the readable portion only (cleaner than the whole DOM)
    links = _extract_links(readable_soup, final_url, max_links=max_links if include_links else 0)

    # Build the compact Markdown
    md = _format_markdown(
        meta=meta,
        body=body_text,
        body_truncated=truncated,
        links=links,
        include_text=include_text,
        include_metadata=include_metadata,
        include_links=include_links,
        verbosity=verbosity,
    )
    return md or "No content could be extracted."
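# Illustrative call (hypothetical URL; returns a Markdown string like the sketch above):
#   print(extract_relevant("https://example.com/article", verbosity="Brief", max_links=5))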
# -----------------
# Gradio UI (Blocks)
# -----------------
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    # Title & subtitle for clarity
    gr.Markdown("# Fetch MCP — Clean Extract")
    gr.Markdown(
        "Extract **title**, **metadata**, **clean text**, and **links** — without the noisy HTML. "
        "Use Verbosity and caps to keep it tight."
    )

    with gr.Row():
        url_in = gr.Textbox(label="URL", placeholder="https://example.com/article")
        fetch_btn = gr.Button("Fetch Clean Content")

    with gr.Accordion("Options", open=False):
        with gr.Row():
            verbosity = gr.Dropdown(
                label="Verbosity",
                choices=["Brief", "Standard", "Full"],
                value="Standard",
                info="Controls how much text you get back."
            )
            max_chars = gr.Slider(
                400, 12000, value=3000, step=100,
                label="Max Characters (body text)",
                info="Hard cap for body text. Lower = less verbose."
            )
            max_links = gr.Slider(
                0, 100, value=20, step=1,
                label="Max Links",
                info="Limit how many hyperlinks we include."
            )
        with gr.Row():
            include_metadata = gr.Checkbox(value=True, label="Include Metadata")
            include_text = gr.Checkbox(value=True, label="Include Main Text")
            include_links = gr.Checkbox(value=True, label="Include Links")

    # Output as Markdown (compact and readable)
    out = gr.Markdown(label="Result")

    # Wire up the click
    fetch_btn.click(
        fn=extract_relevant,
        inputs=[url_in, verbosity, include_metadata, include_text, include_links, max_chars, max_links],
        outputs=out
    )
# Keep MCP server enabled
if __name__ == "__main__":
    demo.launch(mcp_server=True)