# File: main/app.py
# Purpose: Fetch a URL and return only relevant text (title, metadata, clean main text) and hyperlinks.
# Bonus: Special handling for Hacker News front page to list posts (rank, title, points, comments).
# UI: Gradio Blocks with Markdown + DataFrame outputs, suitable for MCP usage.
# Notes: Comments are in layman's terms to explain each section.
import gradio as gr # UI framework for the web app
import requests # HTTP client to fetch web pages
from bs4 import BeautifulSoup # HTML parser to extract tags and text
from readability import Document # Readability algorithm to find main content
from urllib.parse import urljoin, urlparse # Tools to resolve relative/absolute URLs
from dataclasses import dataclass # For neat, typed containers
from typing import List, Dict, Tuple
import re # Regular expressions for cleanup
from datetime import datetime # For formatting dates in metadata safely
# =========================
# Helpers: small data shapes
# =========================
@dataclass
class PageMetadata:
# Simple holder for high-level metadata we care about
title: str = ""
canonical_url: str = ""
description: str = ""
site_name: str = ""
og_type: str = ""
og_url: str = ""
published_time: str = "" # ISO-ish if detected
# =========================
# Network: fetch raw HTML
# =========================
def fetch_html(url: str, timeout: int = 12) -> str:
"""
Downloads the HTML for a given URL using a browser-like User-Agent.
Returns text or raises an HTTP/Request error if something fails.
"""
headers = {
# Pretend to be a modern desktop browser so we don't get blocked
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/127.0.0.0 Safari/537.36"
)
}
resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()  # 4xx/5xx raises here; the Gradio handler that calls us catches it
return resp.text
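# Illustrative usage sketch (hypothetical helper, not wired into the UI): call fetch_html and
# turn network failures into an empty string. requests.RequestException covers timeouts, DNS
# errors, and the HTTPError raised by raise_for_status(). The URL below is just a placeholder.
def _try_fetch(url: str = "https://example.com") -> str:
    try:
        return fetch_html(url)
    except requests.RequestException:
        return ""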
# ===================================
# Generic extraction: metadata + text
# ===================================
def extract_metadata(soup: BeautifulSoup, base_url: str) -> PageMetadata:
"""
    Pulls common metadata from the <title> tag, <meta> tags, and the canonical <link>.
We check Open Graph and Twitter tags as fallbacks too.
"""
md = PageMetadata()
    # Title from the <title> tag (og:title/twitter:title are used as fallbacks below)
title_tag = soup.find("title")
md.title = (title_tag.get_text(strip=True) if title_tag else "").strip()
# Meta helpers
def meta(name=None, property=None):
if name:
tag = soup.find("meta", attrs={"name": name})
if tag and tag.get("content"):
return tag["content"].strip()
if property:
tag = soup.find("meta", attrs={"property": property})
if tag and tag.get("content"):
return tag["content"].strip()
return ""
# Description (prefer og:description > twitter:description > meta description)
md.description = (
meta(property="og:description")
or meta(name="twitter:description")
or meta(name="description")
or ""
).strip()
# Site name (if available)
md.site_name = (meta(property="og:site_name") or "").strip()
# OpenGraph URL + type (if available)
md.og_url = (meta(property="og:url") or "").strip()
md.og_type = (meta(property="og:type") or "").strip()
# Canonical URL (normalize relative -> absolute)
canon = soup.find("link", rel="canonical")
if canon and canon.get("href"):
md.canonical_url = urljoin(base_url, canon["href"].strip())
else:
        # If no canonical link, fall back to og:url when present (else the fetched URL)
md.canonical_url = md.og_url or base_url
# Try some common publish-time signals
published = (
meta(property="article:published_time")
or meta(name="pubdate")
or meta(name="date")
or ""
).strip()
md.published_time = published
    # If no normal <title>, try OG or Twitter titles
if not md.title:
md.title = (meta(property="og:title") or meta(name="twitter:title") or "").strip()
return md
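# Illustrative sketch (hypothetical helper): extract_metadata run over a tiny hand-written
# document, purely to show which tags feed which fields. The HTML and base URL are made up.
def _metadata_demo() -> PageMetadata:
    sample = (
        "<html><head>"
        "<title>Sample</title>"
        '<meta property="og:description" content="A short description.">'
        '<link rel="canonical" href="/post/1">'
        "</head><body></body></html>"
    )
    soup = BeautifulSoup(sample, "lxml")
    # canonical_url resolves against the base: https://example.com/post/1
    return extract_metadata(soup, "https://example.com")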
def extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
"""
Uses the readability library to find the 'main content' of an article-like page.
Returns a clean text string and a BeautifulSoup of the main content HTML
(so we can also extract links from just the relevant area).
    If readability fails or misfires (e.g., on index pages), we fall back gracefully to empty text.
"""
try:
doc = Document(html) # Run Readability on the HTML
summary_html = doc.summary() # This is the extracted main-content HTML
# Parse the readability summary into a soup so we can pull out links cleanly
summary_soup = BeautifulSoup(summary_html, "lxml")
# Turn HTML to plain text: keep paragraphs and line breaks readable
# Remove scripts/styles etc. if any slipped through
for tag in summary_soup(["script", "style", "noscript"]):
tag.decompose()
text = summary_soup.get_text("\n", strip=True)
text = re.sub(r"\n{3,}", "\n\n", text) # Collapse superfluous line breaks
return text, summary_soup
except Exception:
# If something goes wrong (e.g., not article-shaped), return empty content
return "", BeautifulSoup("", "lxml")
def collect_links(soup: BeautifulSoup, base_url: str, only_content_area: bool, fallback_html: str) -> List[Dict]:
"""
Finds hyperlinks. If we have a 'main content' soup and the user asked for
content-only links, we grab links from there; otherwise, fall back to the whole page.
We resolve relative URLs to absolute and skip junk (javascript:, #, mailto:).
"""
anchors = []
if soup and only_content_area:
anchors = soup.find_all("a")
else:
full = BeautifulSoup(fallback_html, "lxml")
anchors = full.find_all("a")
results = []
seen = set()
for a in anchors:
href = (a.get("href") or "").strip()
text = a.get_text(" ", strip=True)
if not href:
continue
        # Skip in-page anchors, javascript: pseudo-links, and mailto: links
        if href.startswith(("#", "javascript:", "mailto:")):
continue
# Make absolute
absolute = urljoin(base_url, href)
# Deduplicate by absolute URL + link text combo
key = (absolute, text)
if key in seen:
continue
seen.add(key)
domain = urlparse(absolute).netloc
results.append({"Text": text or "(no text)", "URL": absolute, "Domain": domain})
return results
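# Illustrative sketch (hypothetical helper): collect_links with only_content_area=False, so the
# whole page is scanned via fallback_html. Relative hrefs resolve against the base URL and the
# mailto: link is skipped. The sample page and base URL are invented for the example.
def _links_demo() -> List[Dict]:
    page = (
        "<html><body>"
        '<a href="/about">About</a>'
        '<a href="mailto:hi@example.com">Email</a>'
        '<a href="https://news.ycombinator.com">HN</a>'
        "</body></html>"
    )
    # Expected rows: https://example.com/about and https://news.ycombinator.com
    return collect_links(BeautifulSoup("", "lxml"), "https://example.com",
                         only_content_area=False, fallback_html=page)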
# ====================================
# Special-case: Hacker News front page
# ====================================
def is_hn_front(url: str) -> bool:
"""
    Checks whether the URL is the Hacker News front page (news.ycombinator.com).
    That page gets special handling so posts can be listed with rank, title, points, and comments.
"""
p = urlparse(url)
if p.netloc != "news.ycombinator.com":
return False
# Treat /, /news, or /front as "front page" style
return p.path in ("", "/", "/news", "/front")
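# Illustrative checks (hypothetical helper, not used elsewhere): the front page matches with or
# without the /news path, while item pages and other hosts do not.
def _hn_front_examples() -> Dict[str, bool]:
    return {
        "https://news.ycombinator.com/": is_hn_front("https://news.ycombinator.com/"),          # True
        "https://news.ycombinator.com/news": is_hn_front("https://news.ycombinator.com/news"),  # True
        "https://news.ycombinator.com/item?id=1": is_hn_front("https://news.ycombinator.com/item?id=1"),  # False
        "https://example.com/": is_hn_front("https://example.com/"),                            # False
    }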
def parse_hn_front(html: str, base_url: str) -> Tuple[str, List[Dict]]:
"""
Parses the Hacker News front page HTML to extract ranked items with points and comments.
Returns a Markdown overview and a list-of-dicts suitable for a table.
"""
soup = BeautifulSoup(html, "lxml")
items = []
    # Each story is a <tr class="athing"> row; the following row's "subtext" holds its points and comment count