# File: main/app.py
# Purpose: Fetch a URL and return only relevant text (title, metadata, clean main text) and hyperlinks.
# Bonus: Special handling for Hacker News front page to list posts (rank, title, points, comments).
# UI: Gradio Blocks with Markdown + DataFrame outputs, suitable for MCP usage.
# Notes: Comments are in layman's terms to explain each section.

import gradio as gr                          # UI framework for the web app
import requests                              # HTTP client to fetch web pages
from bs4 import BeautifulSoup                # HTML parser to extract tags and text
from readability import Document             # Readability algorithm to find main content
from urllib.parse import urljoin, urlparse   # Tools to resolve relative/absolute URLs
from dataclasses import dataclass            # For neat, typed containers
from typing import List, Dict, Tuple
import re                                    # Regular expressions for cleanup
from datetime import datetime                # For formatting dates in metadata safely

# =========================
# Helpers: small data shapes
# =========================

@dataclass
class PageMetadata:
    # Simple holder for high-level metadata we care about
    title: str = ""
    canonical_url: str = ""
    description: str = ""
    site_name: str = ""
    og_type: str = ""
    og_url: str = ""
    published_time: str = ""  # ISO-ish if detected

# =========================
# Network: fetch raw HTML
# =========================

def fetch_html(url: str, timeout: int = 12) -> str:
    """
    Downloads the HTML for a given URL using a browser-like User-Agent.
    Returns text or raises an HTTP/Request error if something fails.
    """
    headers = {
        # Pretend to be a modern desktop browser so we don't get blocked
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/127.0.0.0 Safari/537.36"
        )
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()  # If it's 4xx/5xx, this throws; we catch it in the Gradio fn (extract_page)
    return resp.text

# ===================================
# Generic extraction: metadata + text
# ===================================

def extract_metadata(soup: BeautifulSoup, base_url: str) -> PageMetadata:
    """
    Pulls common metadata from <title>, <meta>, and <link rel="canonical">.
    We check Open Graph and Twitter tags as fallbacks too.
    """
    md = PageMetadata()

    # Title from <title> or og:title/twitter:title
    title_tag = soup.find("title")
    md.title = (title_tag.get_text(strip=True) if title_tag else "").strip()

    # Meta helpers
    def meta(name=None, property=None):
        if name:
            tag = soup.find("meta", attrs={"name": name})
            if tag and tag.get("content"):
                return tag["content"].strip()
        if property:
            tag = soup.find("meta", attrs={"property": property})
            if tag and tag.get("content"):
                return tag["content"].strip()
        return ""

    # Description (prefer og:description > twitter:description > meta description)
    md.description = (
        meta(property="og:description")
        or meta(name="twitter:description")
        or meta(name="description")
        or ""
    ).strip()

    # Site name (if available)
    md.site_name = (meta(property="og:site_name") or "").strip()

    # OpenGraph URL + type (if available)
    md.og_url = (meta(property="og:url") or "").strip()
    md.og_type = (meta(property="og:type") or "").strip()

    # Canonical URL (normalize relative -> absolute)
    canon = soup.find("link", rel="canonical")
    if canon and canon.get("href"):
        md.canonical_url = urljoin(base_url, canon["href"].strip())
    else:
        # If no canonical, we may fall back to og:url if present
        md.canonical_url = md.og_url or base_url

    # Try some common publish-time signals
    published = (
        meta(property="article:published_time")
        or meta(name="pubdate")
        or meta(name="date")
        or ""
    ).strip()
    md.published_time = published

    # If no normal <title>, try OG or Twitter titles
    if not md.title:
        md.title = (meta(property="og:title") or meta(name="twitter:title") or "").strip()

    return md


def extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
    """
    Uses the readability library to find the 'main content' of an article-like page.
    Returns a clean text string and a BeautifulSoup of the main content HTML
    (so we can also extract links from just the relevant area).
    If readability fails/misfires (like index pages), we gracefully fall back to empty text.
    """
    try:
        doc = Document(html)          # Run Readability on the HTML
        summary_html = doc.summary()  # This is the extracted main-content HTML

        # Parse the readability summary into a soup so we can pull out links cleanly
        summary_soup = BeautifulSoup(summary_html, "lxml")

        # Turn HTML to plain text: keep paragraphs and line breaks readable
        # Remove scripts/styles etc. if any slipped through
        for tag in summary_soup(["script", "style", "noscript"]):
            tag.decompose()

        text = summary_soup.get_text("\n", strip=True)
        text = re.sub(r"\n{3,}", "\n\n", text)  # Collapse superfluous line breaks
        return text, summary_soup
    except Exception:
        # If something goes wrong (e.g., not article-shaped), return empty content
        return "", BeautifulSoup("", "lxml")


def collect_links(soup: BeautifulSoup, base_url: str, only_content_area: bool, fallback_html: str) -> List[Dict]:
    """
    Finds hyperlinks. If we have a 'main content' soup and the user asked for content-only links,
    we grab links from there; otherwise, fall back to the whole page.
    We resolve relative URLs to absolute and skip junk (javascript:, #, mailto:).
    """
    anchors = []
    if soup and only_content_area:
        anchors = soup.find_all("a")
    else:
        full = BeautifulSoup(fallback_html, "lxml")
        anchors = full.find_all("a")

    results = []
    seen = set()
    for a in anchors:
        href = (a.get("href") or "").strip()
        text = a.get_text(" ", strip=True)
        if not href:
            continue
        # Skip empty, anchors, JS, and non-http links
        if href.startswith("#") or href.startswith("javascript:") or href.startswith("mailto:"):
            continue
        # Make absolute
        absolute = urljoin(base_url, href)
        # Deduplicate by absolute URL + link text combo
        key = (absolute, text)
        if key in seen:
            continue
        seen.add(key)
        domain = urlparse(absolute).netloc
        results.append({"Text": text or "(no text)", "URL": absolute, "Domain": domain})
    return results

# ====================================
# Special-case: Hacker News front page
# ====================================

def is_hn_front(url: str) -> bool:
    """
    Checks if the URL is the Hacker News front page (news.ycombinator.com).
    We'll special-handle it for a great experience listing posts.
    """
    p = urlparse(url)
    if p.netloc != "news.ycombinator.com":
        return False
    # Treat /, /news, or /front as "front page" style
    return p.path in ("", "/", "/news", "/front")


def parse_hn_front(html: str, base_url: str) -> Tuple[str, List[Dict]]:
    """
    Parses the Hacker News front page HTML to extract ranked items with points and comments.
    Returns a Markdown overview and a list-of-dicts suitable for a table.
    """
    soup = BeautifulSoup(html, "lxml")
    items = []

    # Each story is a <tr class="athing">; subtext is in the immediate next <tr>
    for story in soup.select("tr.athing"):
        # Rank (e.g., "1.") is usually in a sibling cell, but sometimes inside the row itself
        rank_tag = story.select_one("span.rank")
        rank = (rank_tag.get_text(strip=True).replace(".", "") if rank_tag else "")

        # Title + URL (HN changed markup: 'span.titleline a' is current)
        title_a = (
            story.select_one("span.titleline > a")
            or story.select_one("a.titlelink")
            or story.select_one("a.storylink")
        )
        title = title_a.get_text(strip=True) if title_a else "(no title)"
        url = urljoin(base_url, title_a["href"]) if (title_a and title_a.get("href")) else base_url

        # Source domain (e.g., (github.com))
        site = story.select_one("span.sitestr")
        source = site.get_text(strip=True) if site else urlparse(url).netloc

        # Subtext row comes right after the 'athing' row
        subtext_row = story.find_next_sibling("tr")
        points, comments, age, by = "", "", "", ""
        if subtext_row:
            # Points like "123 points"
            score = subtext_row.select_one("span.score")
            points = score.get_text(strip=True) if score else ""
            # Byline: "by username"
            user_a = subtext_row.select_one("a.hnuser")
            by = user_a.get_text(strip=True) if user_a else ""
            # Age: "5 hours ago"
            age_tag = subtext_row.select_one("span.age")
            age = age_tag.get_text(strip=True) if age_tag else ""
            # Comments link: last <a> typically ends with "comments" or "discuss"
            comment_a = None
            links = subtext_row.select("a")
            if links:
                comment_a = links[-1]
            comments = (comment_a.get_text(strip=True) if comment_a else "").lower()

        items.append({
            "Rank": rank,
            "Title": title,
            "URL": url,
            "Source": source,
            "Points": points,
            "By": by,
            "Age": age,
            "Comments": comments,
        })

    # Build a tight Markdown digest so you can "use" HN inside the tool
    md_lines = ["# Hacker News — Front Page", "", "Here are the current front-page posts (click to open):", ""]
    for it in items:
        rank = it["Rank"] or "•"
        title = it["Title"]
        url = it["URL"]
        pts = it["Points"] or ""
        cmt = it["Comments"] or ""
        age = it["Age"] or ""
        src = it["Source"] or ""
        # Example line: "1. [Cool Project](url) — 345 points • 123 comments • 5 hours ago (github.com)"
        extras = " — ".join(filter(None, [
            " ".join(filter(None, [pts, cmt])),
            age,
            f"({src})"
        ]))
        md_lines.append(f"{rank}. [{title}]({url}){(' — ' + extras) if extras else ''}")

    md = "\n".join(md_lines) if items else "# Hacker News — No items found"
    return md, items
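
# --- Quick self-check (illustrative sketch, not wired into the UI) ---
# A hand-written two-row snippet that mimics the front-page markup the parser expects
# (a tr.athing row followed by its subtext row). The markup is a simplified assumption
# of HN's current structure, used only to show the shape of the rows parse_hn_front returns.

def _example_parse_hn_snippet() -> None:
    sample = """
    <table>
      <tr class="athing" id="101">
        <td><span class="rank">1.</span></td>
        <td>
          <span class="titleline"><a href="https://example.com/post">Example Post</a>
            <span class="sitebit">(<span class="sitestr">example.com</span>)</span></span>
        </td>
      </tr>
      <tr>
        <td class="subtext">
          <span class="score">123 points</span> by <a class="hnuser">alice</a>
          <span class="age">2 hours ago</span> |
          <a href="item?id=101">45 comments</a>
        </td>
      </tr>
    </table>
    """
    md, items = parse_hn_front(sample, "https://news.ycombinator.com/")
    print(items[0]["Rank"], items[0]["Title"], items[0]["Points"], items[0]["Comments"])
    # Expected (roughly): 1 Example Post 123 points 45 comments
    print(md.splitlines()[0])  # first line of the Markdown digest
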

# ===========================
# Public function for Gradio
# ===========================

def extract_page(url: str, full_text: bool, max_links: int, content_links_only: bool) -> Tuple[str, List[Dict]]:
    """
    Main function wired to the UI.
    - Fetches the page
    - If it's Hacker News front page, parse posts specially
    - Otherwise: extract metadata, main text (optional), and links
    - Returns Markdown (summary) + a table of links
    """
    try:
        html = fetch_html(url)
    except requests.exceptions.RequestException as e:
        # Friendly error message for the UI textbox
        return f"## Error\nUnable to fetch the page.\n\n**Details:** {e}", []

    # Hacker News special handling for top-notch usability
    if is_hn_front(url):
        md, items = parse_hn_front(html, url)
        return md, items  # For HN, the table is the rich story list

    # Generic page pipeline
    soup_full = BeautifulSoup(html, "lxml")             # Full page soup for metadata and optional link fallback
    metadata = extract_metadata(soup_full, url)         # Title, canonical, description, etc.
    main_text, summary_soup = extract_main_text(html)   # Readability content (may be empty on index pages)

    # Choose where we harvest links from
    links = collect_links(summary_soup, url, content_links_only, html)
    if max_links and max_links > 0:
        links = links[:max_links]

    # Build a readable Markdown summary
    md_lines = []

    # Title line (prefer metadata title)
    title_to_show = metadata.title or "(Untitled)"
    md_lines.append(f"# {title_to_show}")

    # Canonical + URL info
    if metadata.canonical_url and metadata.canonical_url != url:
        md_lines.append(f"- **Canonical:** {metadata.canonical_url}")
    md_lines.append(f"- **URL:** {url}")

    # Optional metadata lines
    if metadata.site_name:
        md_lines.append(f"- **Site:** {metadata.site_name}")
    if metadata.description:
        md_lines.append(f"- **Description:** {metadata.description}")
    if metadata.published_time:
        md_lines.append(f"- **Published:** {metadata.published_time}")
    if metadata.og_type:
        md_lines.append(f"- **OG Type:** {metadata.og_type}")

    # Spacer
    md_lines.append("\n---\n")

    # Main content (optional, controlled by checkbox)
    if full_text and main_text:
        md_lines.append("## Main Content")
        # Keep things readable; long pages can be huge—Readability already helps keep it topical
        md_lines.append(main_text)
        md_lines.append("\n---\n")

    # Links brief (we also return a structured table below)
    md_lines.append("## Links Found")
    md_lines.append(
        f"Showing {'content-only' if content_links_only else 'all-page'} links (up to {max_links}). "
        "Click any to open in a new tab."
    )

    md = "\n".join(md_lines)
    return md, links

# ===========
# Gradio UI
# ===========

# Build a Blocks UI so we can have multiple outputs (Markdown + DataFrame) nicely arranged
with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP — Clean Text & Links") as demo:
    # --- Header area: title + quick helper buttons
    gr.Markdown("# Fetch MCP — Clean Text & Links\n"
                "Extract clean **title**, **metadata**, **main text**, and **hyperlinks** from any page.\n\n"
                "- Special handling for **Hacker News** front page (rank, points, comments).\n"
                "- Toggle **Full Text** if you also want the extracted article content.")

    with gr.Row():
        url_in = gr.Textbox(
            label="URL",
            placeholder="https://news.ycombinator.com/ • https://example.com/article",
            value="https://news.ycombinator.com/",
            scale=4
        )
        fetch_btn = gr.Button("Fetch / Extract", variant="primary", scale=1)

    with gr.Row():
        full_text_chk = gr.Checkbox(
            label="Include main content text (Readability extract)?",
            value=False
        )
        content_only_chk = gr.Checkbox(
            label="Links from main content only (fallback: full page)?",
            value=True
        )
        max_links_sld = gr.Slider(
            label="Max links to return",
            minimum=10,
            maximum=500,
            value=100,
            step=10
        )

    # Outputs: Markdown summary + a table of links (or HN posts table)
    summary_md = gr.Markdown(label="Summary")
    links_tbl = gr.Dataframe(
        headers=["Rank/—", "Title/Text", "URL", "Source/Domain", "Points", "By", "Age", "Comments"],
        # We won't pre-enforce headers strictly; DataFrame will adapt to dict keys provided.
        interactive=False,
        wrap=True,
        row_count=(0, "dynamic"),
        col_count=(0, "dynamic")
    )

    # Wire up the action: clicking the button runs extract_page and shows results
    fetch_btn.click(
        fn=extract_page,
        inputs=[url_in, full_text_chk, max_links_sld, content_only_chk],
        outputs=[summary_md, links_tbl]
    )

# Keep MCP server behavior enabled for your setup
if __name__ == "__main__":
    demo.launch(mcp_server=True)
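
# --- Usage notes (comments only; nothing here executes) ---
# With mcp_server=True, Gradio should also expose the button's handler (extract_page)
# as an MCP tool, so an MCP-capable client can call it with the same four arguments
# the UI passes. The same function works headlessly from plain Python too, e.g.
# (hypothetical call, values chosen only for illustration):
#   md, rows = extract_page("https://news.ycombinator.com/", full_text=False,
#                           max_links=30, content_links_only=True)
# where md is the Markdown digest and rows is the list of dicts shown in the table.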