# File: main/app.py
# Purpose: Fetch a URL and return only relevant text (title, metadata, clean main text) and hyperlinks.
# Bonus: Special handling for Hacker News front page to list posts (rank, title, points, comments).
# UI: Gradio Blocks with Markdown + DataFrame outputs, suitable for MCP usage.
# Notes: Comments are in layman's terms to explain each section.
import gradio as gr # UI framework for the web app
import requests # HTTP client to fetch web pages
from bs4 import BeautifulSoup # HTML parser to extract tags and text
from readability import Document # Readability algorithm to find main content
from urllib.parse import urljoin, urlparse # Tools to resolve relative/absolute URLs
from dataclasses import dataclass # For neat, typed containers
from typing import List, Dict, Tuple
import re # Regular expressions for cleanup
from datetime import datetime # For formatting dates in metadata (currently unused)
# =========================
# Helpers: small data shapes
# =========================
@dataclass
class PageMetadata:
    # Simple holder for high-level metadata we care about
    title: str = ""
    canonical_url: str = ""
    description: str = ""
    site_name: str = ""
    og_type: str = ""
    og_url: str = ""
    published_time: str = ""  # ISO-ish if detected
# =========================
# Network: fetch raw HTML
# =========================
def fetch_html(url: str, timeout: int = 12) -> str:
"""
Downloads the HTML for a given URL using a browser-like User-Agent.
Returns text or raises an HTTP/Request error if something fails.
"""
headers = {
# Pretend to be a modern desktop browser so we don't get blocked
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/127.0.0.0 Safari/537.36"
)
}
resp = requests.get(url, headers=headers, timeout=timeout)
resp.raise_for_status() # If it's 4xx/5xx, this throws; we catch it above in the Gradio fn
return resp.text
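
# Minimal usage sketch for fetch_html() (illustrative only, not executed at import time;
# the URL is a placeholder and the error handling mirrors what extract_page() does below):
#
#     try:
#         html = fetch_html("https://example.com", timeout=10)
#     except requests.exceptions.RequestException as err:
#         print(f"Fetch failed: {err}")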
# ===================================
# Generic extraction: metadata + text
# ===================================
def extract_metadata(soup: BeautifulSoup, base_url: str) -> PageMetadata:
"""
Pulls common metadata from <title>, <meta>, and <link rel="canonical">.
We check Open Graph and Twitter tags as fallbacks too.
"""
md = PageMetadata()
# Title from <title> or og:title/twitter:title
title_tag = soup.find("title")
md.title = (title_tag.get_text(strip=True) if title_tag else "").strip()
# Meta helpers
def meta(name=None, property=None):
if name:
tag = soup.find("meta", attrs={"name": name})
if tag and tag.get("content"):
return tag["content"].strip()
if property:
tag = soup.find("meta", attrs={"property": property})
if tag and tag.get("content"):
return tag["content"].strip()
return ""
# Description (prefer og:description > twitter:description > meta description)
md.description = (
meta(property="og:description")
or meta(name="twitter:description")
or meta(name="description")
or ""
).strip()
# Site name (if available)
md.site_name = (meta(property="og:site_name") or "").strip()
# OpenGraph URL + type (if available)
md.og_url = (meta(property="og:url") or "").strip()
md.og_type = (meta(property="og:type") or "").strip()
# Canonical URL (normalize relative -> absolute)
canon = soup.find("link", rel="canonical")
if canon and canon.get("href"):
md.canonical_url = urljoin(base_url, canon["href"].strip())
else:
# If no canonical, we may fallback to og:url if present
md.canonical_url = md.og_url or base_url
# Try some common publish-time signals
published = (
meta(property="article:published_time")
or meta(name="pubdate")
or meta(name="date")
or ""
).strip()
md.published_time = published
# If no normal <title>, try OG or Twitter titles
if not md.title:
md.title = (meta(property="og:title") or meta(name="twitter:title") or "").strip()
return md
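
# Illustrative sketch of what extract_metadata() returns (the HTML and values are made up):
#
#     soup = BeautifulSoup("<html><head><title>Hello</title></head></html>", "lxml")
#     md = extract_metadata(soup, "https://example.com/post")
#     # md.title == "Hello"; with no <link rel="canonical"> or og:url,
#     # md.canonical_url falls back to "https://example.com/post"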
def extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
"""
Uses the readability library to find the 'main content' of an article-like page.
Returns a clean text string and a BeautifulSoup of the main content HTML
(so we can also extract links from just the relevant area).
If readability fails/misfires (like index pages), we gracefully fallback to empty text.
"""
try:
doc = Document(html) # Run Readability on the HTML
summary_html = doc.summary() # This is the extracted main-content HTML
# Parse the readability summary into a soup so we can pull out links cleanly
summary_soup = BeautifulSoup(summary_html, "lxml")
# Turn HTML to plain text: keep paragraphs and line breaks readable
# Remove scripts/styles etc. if any slipped through
for tag in summary_soup(["script", "style", "noscript"]):
tag.decompose()
text = summary_soup.get_text("\n", strip=True)
text = re.sub(r"\n{3,}", "\n\n", text) # Collapse superfluous line breaks
return text, summary_soup
except Exception:
# If something goes wrong (e.g., not article-shaped), return empty content
return "", BeautifulSoup("", "lxml")
def collect_links(soup: BeautifulSoup, base_url: str, only_content_area: bool, fallback_html: str) -> List[Dict]:
"""
Finds hyperlinks. If we have a 'main content' soup and the user asked for
content-only links, we grab links from there; otherwise, fall back to the whole page.
We resolve relative URLs to absolute and skip junk (javascript:, #, mailto:).
"""
anchors = []
if soup and only_content_area:
anchors = soup.find_all("a")
else:
full = BeautifulSoup(fallback_html, "lxml")
anchors = full.find_all("a")
results = []
seen = set()
for a in anchors:
href = (a.get("href") or "").strip()
text = a.get_text(" ", strip=True)
if not href:
continue
# Skip empty, anchors, JS, and non-http links
if href.startswith("#") or href.startswith("javascript:") or href.startswith("mailto:"):
continue
# Make absolute
absolute = urljoin(base_url, href)
# Deduplicate by absolute URL + link text combo
key = (absolute, text)
if key in seen:
continue
seen.add(key)
domain = urlparse(absolute).netloc
results.append({"Text": text or "(no text)", "URL": absolute, "Domain": domain})
return results
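
# Illustrative shape of one returned row (the link itself is made up):
#
#     {"Text": "Docs", "URL": "https://example.com/docs", "Domain": "example.com"}
#
# Plain dicts keep the result easy to render in the Gradio table below.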
# ====================================
# Special-case: Hacker News front page
# ====================================
def is_hn_front(url: str) -> bool:
"""
Checks if the URL is the Hacker News front page (news.ycombinator.com).
We'll special-handle it for a great experience listing posts.
"""
p = urlparse(url)
if p.netloc != "news.ycombinator.com":
return False
# Treat /, /news, or /front as "front page" style
return p.path in ("", "/", "/news", "/front")
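
# Quick sanity check (illustrative, not executed):
#
#     is_hn_front("https://news.ycombinator.com/")           # -> True
#     is_hn_front("https://news.ycombinator.com/item?id=1")  # -> False (item pages use the generic pipeline)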
def parse_hn_front(html: str, base_url: str) -> Tuple[str, List[Dict]]:
"""
Parses the Hacker News front page HTML to extract ranked items with points and comments.
Returns a Markdown overview and a list-of-dicts suitable for a table.
"""
soup = BeautifulSoup(html, "lxml")
items = []
# Each story is a <tr class="athing">; subtext is in the immediate next <tr>
for story in soup.select("tr.athing"):
# Rank (e.g., "1.") is usually in a sibling cell, but sometimes inside
rank_tag = story.select_one("span.rank")
rank = (rank_tag.get_text(strip=True).replace(".", "") if rank_tag else "")
# Title + URL (HN changed markup: 'span.titleline a' is current)
title_a = story.select_one("span.titleline > a") or story.select_one("a.titlelink") or story.select_one("a.storylink")
title = title_a.get_text(strip=True) if title_a else "(no title)"
url = urljoin(base_url, title_a["href"]) if (title_a and title_a.get("href")) else base_url
# Source domain (e.g., (github.com))
site = story.select_one("span.sitestr")
source = site.get_text(strip=True) if site else urlparse(url).netloc
# Subtext row comes right after the 'athing' row
subtext_row = story.find_next_sibling("tr")
points, comments, age, by = "", "", "", ""
if subtext_row:
# Points like "123 points"
score = subtext_row.select_one("span.score")
points = score.get_text(strip=True) if score else ""
# Byline: "by username"
user_a = subtext_row.select_one("a.hnuser")
by = user_a.get_text(strip=True) if user_a else ""
# Age: "5 hours ago"
age_tag = subtext_row.select_one("span.age")
age = age_tag.get_text(strip=True) if age_tag else ""
# Comments link: last <a> typically ends with "comments" or "discuss"
comment_a = None
links = subtext_row.select("a")
if links:
comment_a = links[-1]
comments = (comment_a.get_text(strip=True) if comment_a else "").lower()
items.append({
"Rank": rank,
"Title": title,
"URL": url,
"Source": source,
"Points": points,
"By": by,
"Age": age,
"Comments": comments,
})
# Build a tight Markdown digest so you can "use" HN inside the tool
md_lines = ["# Hacker News — Front Page",
"",
"Here are the current front-page posts (click to open):",
""]
for it in items:
rank = it["Rank"] or "•"
title = it["Title"]
url = it["URL"]
pts = it["Points"] or ""
cmt = it["Comments"] or ""
age = it["Age"] or ""
src = it["Source"] or ""
# Example line: "1. [Cool Project](url) — 345 points • 123 comments • 5 hours ago (github.com)"
extras = " — ".join(filter(None, [
" ".join(filter(None, [pts, cmt])),
age,
f"({src})"
]))
md_lines.append(f"{rank}. [{title}]({url}){(' — ' + extras) if extras else ''}")
md = "\n".join(md_lines) if items else "# Hacker News — No items found"
return md, items
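
# Illustrative shape of one parsed item (all values are made up):
#
#     {"Rank": "1", "Title": "Cool Project", "URL": "https://example.com/project",
#      "Source": "example.com", "Points": "345 points", "By": "someone",
#      "Age": "5 hours ago", "Comments": "123 comments"}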
# ===========================
# Public function for Gradio
# ===========================
def extract_page(url: str, full_text: bool, max_links: int, content_links_only: bool) -> Tuple[str, List[Dict]]:
"""
Main function wired to the UI.
- Fetches the page
- If it's Hacker News front page, parse posts specially
- Otherwise: extract metadata, main text (optional), and links
- Returns Markdown (summary) + a table of links
"""
try:
html = fetch_html(url)
except requests.exceptions.RequestException as e:
# Friendly error message for the UI textbox
return f"## Error\nUnable to fetch the page.\n\n**Details:** {e}", []
# Hacker News special handling for top-notch usability
if is_hn_front(url):
md, items = parse_hn_front(html, url)
return md, items # For HN, the table is the rich story list
# Generic page pipeline
soup_full = BeautifulSoup(html, "lxml") # Full page soup for metadata and optional link fallback
metadata = extract_metadata(soup_full, url) # Title, canonical, description, etc.
main_text, summary_soup = extract_main_text(html) # Readability content (may be empty on index pages)
# Choose where we harvest links from
links = collect_links(summary_soup, url, content_links_only, html)
if max_links and max_links > 0:
links = links[:max_links]
# Build a readable Markdown summary
md_lines = []
# Title line (prefer metadata title)
title_to_show = metadata.title or "(Untitled)"
md_lines.append(f"# {title_to_show}")
# Canonical + URL info
if metadata.canonical_url and metadata.canonical_url != url:
md_lines.append(f"- **Canonical:** {metadata.canonical_url}")
md_lines.append(f"- **URL:** {url}")
# Optional metadata lines
if metadata.site_name:
md_lines.append(f"- **Site:** {metadata.site_name}")
if metadata.description:
md_lines.append(f"- **Description:** {metadata.description}")
if metadata.published_time:
md_lines.append(f"- **Published:** {metadata.published_time}")
if metadata.og_type:
md_lines.append(f"- **OG Type:** {metadata.og_type}")
# Spacer
md_lines.append("\n---\n")
# Main content (optional, controlled by checkbox)
if full_text and main_text:
md_lines.append("## Main Content")
# Keep things readable; long pages can be huge—Readability already helps keep it topical
md_lines.append(main_text)
md_lines.append("\n---\n")
# Links brief (we also return a structured table below)
md_lines.append("## Links Found")
md_lines.append(
f"Showing {'content-only' if content_links_only else 'all-page'} links (up to {max_links}). "
"Click any to open in a new tab."
)
md = "\n".join(md_lines)
return md, links
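
# Illustrative, programmatic use (e.g., when invoked as an MCP tool rather than through the UI);
# the URL is a placeholder and this call is not executed at import time:
#
#     summary_markdown, link_rows = extract_page(
#         "https://example.com/article", full_text=True, max_links=50, content_links_only=True
#     )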
# ===========
# Gradio UI
# ===========
# Build a Blocks UI so we can have multiple outputs (Markdown + DataFrame) nicely arranged
with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP — Clean Text & Links") as demo:
    # --- Header area: title + usage notes
    gr.Markdown("# Fetch MCP — Clean Text & Links\n"
                "Extract clean **title**, **metadata**, **main text**, and **hyperlinks** from any page.\n\n"
                "- Special handling for **Hacker News** front page (rank, points, comments).\n"
                "- Toggle **Full Text** if you also want the extracted article content.")
    with gr.Row():
        url_in = gr.Textbox(
            label="URL",
            placeholder="https://news.ycombinator.com/ • https://example.com/article",
            value="https://news.ycombinator.com/",
            scale=4
        )
        fetch_btn = gr.Button("Fetch / Extract", variant="primary", scale=1)
    with gr.Row():
        full_text_chk = gr.Checkbox(
            label="Include main content text (Readability extract)?",
            value=False
        )
        content_only_chk = gr.Checkbox(
            label="Links from main content only (fallback: full page)?",
            value=True
        )
        max_links_sld = gr.Slider(
            label="Max links to return",
            minimum=10, maximum=500, value=100, step=10
        )
    # Outputs: Markdown summary + a table of links (or HN posts table)
    summary_md = gr.Markdown(label="Summary")
    links_tbl = gr.Dataframe(
        headers=["Rank/—", "Title/Text", "URL", "Source/Domain", "Points", "By", "Age", "Comments"],
        # We don't strictly enforce these headers; the DataFrame adapts to the dict keys provided.
        interactive=False,
        wrap=True,
        row_count=(0, "dynamic"),
        col_count=(0, "dynamic")
    )
    # Wire up the action: clicking the button runs extract_page and shows the results
    fetch_btn.click(
        fn=extract_page,
        inputs=[url_in, full_text_chk, max_links_sld, content_only_chk],
        outputs=[summary_md, links_tbl]
    )

# Keep MCP server behavior enabled for your setup
if __name__ == "__main__":
    demo.launch(mcp_server=True)