import gradio as gr
import requests
from bs4 import BeautifulSoup
from readability import Document
from urllib.parse import urljoin, urlparse
from dataclasses import dataclass
from typing import List, Dict, Tuple
import re


@dataclass
class PageMetadata:
    title: str = ""
    canonical_url: str = ""
    description: str = ""
    site_name: str = ""
    og_type: str = ""
    og_url: str = ""
    published_time: str = ""


def fetch_html(url: str, timeout: int = 12) -> str:
    """
    Downloads the HTML for a given URL using a browser-like User-Agent.
    Returns the response text, or raises a requests exception (HTTP or network) on failure.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/127.0.0.0 Safari/537.36"
        )
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    return resp.text
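# Illustrative usage (not executed here; the returned HTML depends on the site,
# and the shown prefix is only an assumption for the example URL):
#
#   html = fetch_html("https://example.com")
#   html[:15]   # e.g. '<!doctype html>'
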
def extract_metadata(soup: BeautifulSoup, base_url: str) -> PageMetadata:
    """
    Pulls common metadata from <title>, <meta>, and <link rel="canonical">.
    We check Open Graph and Twitter tags as fallbacks too.
    """
    md = PageMetadata()

    title_tag = soup.find("title")
    md.title = (title_tag.get_text(strip=True) if title_tag else "").strip()

    def meta(name=None, property=None):
        if name:
            tag = soup.find("meta", attrs={"name": name})
            if tag and tag.get("content"):
                return tag["content"].strip()
        if property:
            tag = soup.find("meta", attrs={"property": property})
            if tag and tag.get("content"):
                return tag["content"].strip()
        return ""

    md.description = (
        meta(property="og:description")
        or meta(name="twitter:description")
        or meta(name="description")
        or ""
    ).strip()

    md.site_name = (meta(property="og:site_name") or "").strip()

    md.og_url = (meta(property="og:url") or "").strip()
    md.og_type = (meta(property="og:type") or "").strip()

    canon = soup.find("link", rel="canonical")
    if canon and canon.get("href"):
        md.canonical_url = urljoin(base_url, canon["href"].strip())
    else:
        # No <link rel="canonical">: fall back to og:url, then to the fetched URL.
        md.canonical_url = md.og_url or base_url

    published = (
        meta(property="article:published_time")
        or meta(name="pubdate")
        or meta(name="date")
        or ""
    ).strip()
    md.published_time = published

    if not md.title:
        md.title = (meta(property="og:title") or meta(name="twitter:title") or "").strip()

    return md
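# Illustrative usage (field values are hypothetical and depend on the page):
#
#   soup = BeautifulSoup(html, "lxml")
#   md = extract_metadata(soup, "https://example.com/post")
#   md.title           # e.g. "Some Article Title"
#   md.canonical_url   # e.g. "https://example.com/post"
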
def extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
    """
    Uses the readability library to find the 'main content' of an article-like page.
    Returns a clean text string and a BeautifulSoup of the main content HTML
    (so we can also extract links from just the relevant area).
    If readability fails or misfires (e.g. on index pages), we gracefully fall back to empty output.
    """
    try:
        doc = Document(html)
        summary_html = doc.summary()
        summary_soup = BeautifulSoup(summary_html, "lxml")

        for tag in summary_soup(["script", "style", "noscript"]):
            tag.decompose()
        text = summary_soup.get_text("\n", strip=True)
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text, summary_soup
    except Exception:
        # Readability could not isolate an article body; return empty results.
        return "", BeautifulSoup("", "lxml")
def collect_links(soup: BeautifulSoup, base_url: str, only_content_area: bool, fallback_html: str) -> List[Dict]:
    """
    Finds hyperlinks. If we have a 'main content' soup and the user asked for
    content-only links, we grab links from there; otherwise, fall back to the whole page.
    We resolve relative URLs to absolute and skip junk (javascript:, #, mailto:).
    """
    if soup and only_content_area:
        anchors = soup.find_all("a")
    else:
        full = BeautifulSoup(fallback_html, "lxml")
        anchors = full.find_all("a")

    results = []
    seen = set()
    for a in anchors:
        href = (a.get("href") or "").strip()
        text = a.get_text(" ", strip=True)
        if not href:
            continue
        if href.startswith("#") or href.startswith("javascript:") or href.startswith("mailto:"):
            continue

        absolute = urljoin(base_url, href)

        key = (absolute, text)
        if key in seen:
            continue
        seen.add(key)
        domain = urlparse(absolute).netloc
        results.append({"Text": text or "(no text)", "URL": absolute, "Domain": domain})
    return results
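# Illustrative result shape (values are hypothetical):
#
#   collect_links(summary_soup, "https://example.com/post", True, html)
#   # -> [{"Text": "Next page", "URL": "https://example.com/page/2", "Domain": "example.com"}, ...]
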
def is_hn_front(url: str) -> bool:
    """
    Checks if the URL is the Hacker News front page (news.ycombinator.com).
    We special-case it so posts can be listed with rank, points, and comments.
    """
    p = urlparse(url)
    if p.netloc != "news.ycombinator.com":
        return False
    return p.path in ("", "/", "/news", "/front")
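# Quick sanity examples (these follow directly from the checks above):
#
#   is_hn_front("https://news.ycombinator.com/")            # True
#   is_hn_front("https://news.ycombinator.com/item?id=1")   # False (path is /item)
#   is_hn_front("https://example.com/")                     # False (different host)
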
def parse_hn_front(html: str, base_url: str) -> Tuple[str, List[Dict]]:
    """
    Parses the Hacker News front page HTML to extract ranked items with points and comments.
    Returns a Markdown overview and a list of dicts suitable for a table.
    """
    soup = BeautifulSoup(html, "lxml")
    items = []

    for story in soup.select("tr.athing"):
        rank_tag = story.select_one("span.rank")
        rank = (rank_tag.get_text(strip=True).replace(".", "") if rank_tag else "")

        title_a = story.select_one("span.titleline > a") or story.select_one("a.titlelink") or story.select_one("a.storylink")
        title = title_a.get_text(strip=True) if title_a else "(no title)"
        url = urljoin(base_url, title_a["href"]) if (title_a and title_a.get("href")) else base_url

        site = story.select_one("span.sitestr")
        source = site.get_text(strip=True) if site else urlparse(url).netloc

        # Points, author, age, and comment count live in the sibling "subtext" row.
        subtext_row = story.find_next_sibling("tr")
        points, comments, age, by = "", "", "", ""
        if subtext_row:
            score = subtext_row.select_one("span.score")
            points = score.get_text(strip=True) if score else ""

            user_a = subtext_row.select_one("a.hnuser")
            by = user_a.get_text(strip=True) if user_a else ""

            age_tag = subtext_row.select_one("span.age")
            age = age_tag.get_text(strip=True) if age_tag else ""

            # The last link in the subtext row is normally the comments link
            # ("N comments" or "discuss").
            comment_a = None
            links = subtext_row.select("a")
            if links:
                comment_a = links[-1]
            comments = (comment_a.get_text(strip=True) if comment_a else "").lower()

        items.append({
            "Rank": rank,
            "Title": title,
            "URL": url,
            "Source": source,
            "Points": points,
            "By": by,
            "Age": age,
            "Comments": comments,
        })

    md_lines = ["# Hacker News — Front Page",
                "",
                "Here are the current front-page posts (click to open):",
                ""]
    for it in items:
        rank = it["Rank"] or "•"
        title = it["Title"]
        url = it["URL"]
        pts = it["Points"] or ""
        cmt = it["Comments"] or ""
        age = it["Age"] or ""
        src = it["Source"] or ""

        extras = " — ".join(filter(None, [
            " ".join(filter(None, [pts, cmt])),
            age,
            f"({src})" if src else ""
        ]))
        md_lines.append(f"{rank}. [{title}]({url}){(' — ' + extras) if extras else ''}")
    md = "\n".join(md_lines) if items else "# Hacker News — No items found"

    return md, items
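# Illustrative shape of one parsed item (all values hypothetical; real values
# depend on the live front page at fetch time):
#
#   {"Rank": "1", "Title": "Example post", "URL": "https://example.com/story",
#    "Source": "example.com", "Points": "123 points", "By": "someuser",
#    "Age": "2 hours ago", "Comments": "45 comments"}
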
def extract_page(url: str, full_text: bool, max_links: int, content_links_only: bool) -> Tuple[str, List[Dict]]:
    """
    Main function wired to the UI.
    - Fetches the page
    - If it's the Hacker News front page, parses the posts specially
    - Otherwise: extracts metadata, main text (optional), and links
    - Returns Markdown (summary) + a table of links
    """
    try:
        html = fetch_html(url)
    except requests.exceptions.RequestException as e:
        return f"## Error\nUnable to fetch the page.\n\n**Details:** {e}", []

    if is_hn_front(url):
        md, items = parse_hn_front(html, url)
        return md, items

    soup_full = BeautifulSoup(html, "lxml")
    metadata = extract_metadata(soup_full, url)
    main_text, summary_soup = extract_main_text(html)

    links = collect_links(summary_soup, url, content_links_only, html)
    if max_links and max_links > 0:
        # The slider value may arrive as a float; slicing needs an int.
        links = links[: int(max_links)]

    md_lines = []

    title_to_show = metadata.title or "(Untitled)"
    md_lines.append(f"# {title_to_show}")

    if metadata.canonical_url and metadata.canonical_url != url:
        md_lines.append(f"- **Canonical:** {metadata.canonical_url}")
    md_lines.append(f"- **URL:** {url}")

    if metadata.site_name:
        md_lines.append(f"- **Site:** {metadata.site_name}")
    if metadata.description:
        md_lines.append(f"- **Description:** {metadata.description}")
    if metadata.published_time:
        md_lines.append(f"- **Published:** {metadata.published_time}")
    if metadata.og_type:
        md_lines.append(f"- **OG Type:** {metadata.og_type}")

    md_lines.append("\n---\n")

    if full_text and main_text:
        md_lines.append("## Main Content")
        md_lines.append(main_text)
        md_lines.append("\n---\n")

    md_lines.append("## Links Found")
    md_lines.append(
        f"Showing {'content-only' if content_links_only else 'all-page'} links (up to {int(max_links)}). "
        "Click any link to open it in a new tab."
    )

    md = "\n".join(md_lines)
    return md, links
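# Illustrative programmatic call, bypassing the UI (example.com is just a stand-in URL):
#
#   md, links = extract_page("https://example.com", full_text=False,
#                            max_links=50, content_links_only=True)
#   print(md)         # Markdown summary with title, metadata, and a "Links Found" section
#   print(links[:3])  # first few {"Text", "URL", "Domain"} dicts
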
with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP — Clean Text & Links") as demo:
    gr.Markdown(
        "# Fetch MCP — Clean Text & Links\n"
        "Extract clean **title**, **metadata**, **main text**, and **hyperlinks** from any page.\n\n"
        "- Special handling for the **Hacker News** front page (rank, points, comments).\n"
        "- Toggle **Full Text** if you also want the extracted article content."
    )

    with gr.Row():
        url_in = gr.Textbox(
            label="URL",
            placeholder="https://news.ycombinator.com/ • https://example.com/article",
            value="https://news.ycombinator.com/",
            scale=4
        )
        fetch_btn = gr.Button("Fetch / Extract", variant="primary", scale=1)

    with gr.Row():
        full_text_chk = gr.Checkbox(
            label="Include main content text (Readability extract)?",
            value=False
        )
        content_only_chk = gr.Checkbox(
            label="Links from main content only (fallback: full page)?",
            value=True
        )
        max_links_sld = gr.Slider(
            label="Max links to return",
            minimum=10, maximum=500, value=100, step=10
        )

    summary_md = gr.Markdown(label="Summary")
    links_tbl = gr.Dataframe(
        headers=["Rank/—", "Title/Text", "URL", "Source/Domain", "Points", "By", "Age", "Comments"],
        interactive=False,
        wrap=True,
        row_count=(0, "dynamic"),
        col_count=(0, "dynamic")
    )

    fetch_btn.click(
        fn=extract_page,
        inputs=[url_in, full_text_chk, max_links_sld, content_only_chk],
        outputs=[summary_md, links_tbl]
    )
if __name__ == "__main__":
    demo.launch(mcp_server=True)