# File: main/app.py
# Purpose: Fetch a URL and return only relevant text (title, metadata, clean main text) and hyperlinks.
# Bonus: Special handling for Hacker News front page to list posts (rank, title, points, comments).
# UI: Gradio Blocks with Markdown + DataFrame outputs, suitable for MCP usage.
# Notes: Comments are in layman's terms to explain each section.
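#
# Likely dependencies (package names assumed, not pinned here): gradio, requests,
# beautifulsoup4, lxml, and readability-lxml (which provides the `readability` import).
# With those installed, a minimal local run is just: python app.py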

import gradio as gr                    # UI framework for the web app
import requests                        # HTTP client to fetch web pages
from bs4 import BeautifulSoup          # HTML parser to extract tags and text
from readability import Document       # Readability algorithm to find main content
from urllib.parse import urljoin, urlparse  # Tools to resolve relative/absolute URLs
from dataclasses import dataclass       # For neat, typed containers
from typing import List, Dict, Tuple
import re                               # Regular expressions for cleanup


# =========================
# Helpers: small data shapes
# =========================

@dataclass
class PageMetadata:
    # Simple holder for high-level metadata we care about
    title: str = ""
    canonical_url: str = ""
    description: str = ""
    site_name: str = ""
    og_type: str = ""
    og_url: str = ""
    published_time: str = ""  # ISO-ish if detected


# =========================
# Network: fetch raw HTML
# =========================

def fetch_html(url: str, timeout: int = 12) -> str:
    """
    Downloads the HTML for a given URL using a browser-like User-Agent.
    Returns text or raises an HTTP/Request error if something fails.
    """
    headers = {
        # Pretend to be a modern desktop browser so we don't get blocked
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/127.0.0.0 Safari/537.36"
        )
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()  # 4xx/5xx raises here; extract_page (the Gradio handler below) catches it
    return resp.text
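
# Illustrative usage (a sketch, not executed at import time):
#   html = fetch_html("https://example.com")  # returns the page HTML as a string
#   A requests.exceptions.RequestException propagates on network or HTTP errors.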


# ===================================
# Generic extraction: metadata + text
# ===================================

def extract_metadata(soup: BeautifulSoup, base_url: str) -> PageMetadata:
    """
    Pulls common metadata from <title>, <meta>, and <link rel="canonical">.
    We check Open Graph and Twitter tags as fallbacks too.
    """
    md = PageMetadata()

    # Title from <title> or og:title/twitter:title
    title_tag = soup.find("title")
    md.title = (title_tag.get_text(strip=True) if title_tag else "").strip()

    # Meta helpers
    def meta(name=None, property=None):
        if name:
            tag = soup.find("meta", attrs={"name": name})
            if tag and tag.get("content"):
                return tag["content"].strip()
        if property:
            tag = soup.find("meta", attrs={"property": property})
            if tag and tag.get("content"):
                return tag["content"].strip()
        return ""

    # Description (prefer og:description > twitter:description > meta description)
    md.description = (
        meta(property="og:description")
        or meta(name="twitter:description")
        or meta(name="description")
        or ""
    ).strip()

    # Site name (if available)
    md.site_name = (meta(property="og:site_name") or "").strip()

    # OpenGraph URL + type (if available)
    md.og_url = (meta(property="og:url") or "").strip()
    md.og_type = (meta(property="og:type") or "").strip()

    # Canonical URL (normalize relative -> absolute)
    canon = soup.find("link", rel="canonical")
    if canon and canon.get("href"):
        md.canonical_url = urljoin(base_url, canon["href"].strip())
    else:
        # If there is no canonical link, fall back to og:url, or the requested URL as a last resort
        md.canonical_url = md.og_url or base_url

    # Try some common publish-time signals
    published = (
        meta(property="article:published_time")
        or meta(name="pubdate")
        or meta(name="date")
        or ""
    ).strip()
    md.published_time = published

    # If no normal <title>, try OG or Twitter titles
    if not md.title:
        md.title = (meta(property="og:title") or meta(name="twitter:title") or "").strip()

    return md


def extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
    """
    Uses the readability library to find the 'main content' of an article-like page.
    Returns a clean text string and a BeautifulSoup of the main content HTML
    (so we can also extract links from just the relevant area).
    If readability fails or misfires (e.g. on index pages), we gracefully fall back to empty text.
    """
    try:
        doc = Document(html)                     # Run Readability on the HTML
        summary_html = doc.summary()             # This is the extracted main-content HTML
        # Parse the readability summary into a soup so we can pull out links cleanly
        summary_soup = BeautifulSoup(summary_html, "lxml")
        # Turn HTML to plain text: keep paragraphs and line breaks readable
        # Remove scripts/styles etc. if any slipped through
        for tag in summary_soup(["script", "style", "noscript"]):
            tag.decompose()
        text = summary_soup.get_text("\n", strip=True)
        text = re.sub(r"\n{3,}", "\n\n", text)   # Collapse superfluous line breaks
        return text, summary_soup
    except Exception:
        # If something goes wrong (e.g., not article-shaped), return empty content
        return "", BeautifulSoup("", "lxml")


def collect_links(soup: BeautifulSoup, base_url: str, only_content_area: bool, fallback_html: str) -> List[Dict]:
    """
    Finds hyperlinks. If we have a 'main content' soup and the user asked for
    content-only links, we grab links from there; otherwise, fall back to the whole page.
    We resolve relative URLs to absolute and skip junk (javascript:, #, mailto:).
    """
    anchors = []
    if only_content_area and soup is not None:
        anchors = soup.find_all("a")
    if not anchors:
        # Fall back to the whole page when the main-content area yielded no anchors
        # (e.g. readability produced an empty summary on index-style pages).
        full = BeautifulSoup(fallback_html, "lxml")
        anchors = full.find_all("a")

    results = []
    seen = set()
    for a in anchors:
        href = (a.get("href") or "").strip()
        text = a.get_text(" ", strip=True)
        if not href:
            continue
        # Skip in-page anchors, javascript: pseudo-links, and mailto: links
        if href.startswith("#") or href.startswith("javascript:") or href.startswith("mailto:"):
            continue
        # Make absolute
        absolute = urljoin(base_url, href)
        # Deduplicate by absolute URL + link text combo
        key = (absolute, text)
        if key in seen:
            continue
        seen.add(key)
        domain = urlparse(absolute).netloc
        results.append({"Text": text or "(no text)", "URL": absolute, "Domain": domain})
    return results
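
# Each result row has the shape (values illustrative):
#   {"Text": "Read more", "URL": "https://example.com/post", "Domain": "example.com"}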


# ====================================
# Special-case: Hacker News front page
# ====================================

def is_hn_front(url: str) -> bool:
    """
    Checks if the URL is the Hacker News front page (news.ycombinator.com).
    We special-case it so posts can be listed cleanly (rank, title, points, comments).
    """
    p = urlparse(url)
    if p.netloc != "news.ycombinator.com":
        return False
    # Treat /, /news, or /front as "front page" style
    return p.path in ("", "/", "/news", "/front")
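
# Examples:
#   is_hn_front("https://news.ycombinator.com/")          -> True
#   is_hn_front("https://news.ycombinator.com/item?id=1") -> False
#   is_hn_front("https://example.com/news")               -> False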


def parse_hn_front(html: str, base_url: str) -> Tuple[str, List[Dict]]:
    """
    Parses the Hacker News front page HTML to extract ranked items with points and comments.
    Returns a Markdown overview and a list-of-dicts suitable for a table.
    """
    soup = BeautifulSoup(html, "lxml")
    items = []

    # Each story is a <tr class="athing">; subtext is in the immediate next <tr>
    for story in soup.select("tr.athing"):
        # Rank (e.g., "1.") sits in a span.rank cell within the story row
        rank_tag = story.select_one("span.rank")
        rank = (rank_tag.get_text(strip=True).replace(".", "") if rank_tag else "")

        # Title + URL (HN changed markup: 'span.titleline a' is current)
        title_a = story.select_one("span.titleline > a") or story.select_one("a.titlelink") or story.select_one("a.storylink")
        title = title_a.get_text(strip=True) if title_a else "(no title)"
        url = urljoin(base_url, title_a["href"]) if (title_a and title_a.get("href")) else base_url

        # Source domain (e.g., (github.com))
        site = story.select_one("span.sitestr")
        source = site.get_text(strip=True) if site else urlparse(url).netloc

        # Subtext row comes right after the 'athing' row
        subtext_row = story.find_next_sibling("tr")
        points, comments, age, by = "", "", "", ""
        if subtext_row:
            # Points like "123 points"
            score = subtext_row.select_one("span.score")
            points = score.get_text(strip=True) if score else ""
            # Byline: "by username"
            user_a = subtext_row.select_one("a.hnuser")
            by = user_a.get_text(strip=True) if user_a else ""
            # Age: "5 hours ago"
            age_tag = subtext_row.select_one("span.age")
            age = age_tag.get_text(strip=True) if age_tag else ""
            # Comments link: last <a> typically ends with "comments" or "discuss"
            comment_a = None
            links = subtext_row.select("a")
            if links:
                comment_a = links[-1]
            comments = (comment_a.get_text(strip=True) if comment_a else "").lower()

        items.append({
            "Rank": rank,
            "Title": title,
            "URL": url,
            "Source": source,
            "Points": points,
            "By": by,
            "Age": age,
            "Comments": comments,
        })

    # Build a tight Markdown digest so you can "use" HN inside the tool
    md_lines = ["# Hacker News — Front Page",
                "",
                "Here are the current front-page posts (click to open):",
                ""]
    for it in items:
        rank = it["Rank"] or "•"
        title = it["Title"]
        url = it["URL"]
        pts = it["Points"] or ""
        cmt = it["Comments"] or ""
        age = it["Age"] or ""
        src = it["Source"] or ""
        # Example line: "1. [Cool Project](url) — 345 points • 123 comments • 5 hours ago (github.com)"
        extras = " — ".join(filter(None, [
            " ".join(filter(None, [pts, cmt])),
            age,
            f"({src})"
        ]))
        md_lines.append(f"{rank}. [{title}]({url}){(' — ' + extras) if extras else ''}")
    md = "\n".join(md_lines) if items else "# Hacker News — No items found"

    return md, items
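
# Each HN item dict has the shape (values illustrative):
#   {"Rank": "1", "Title": "Cool Project", "URL": "https://github.com/...",
#    "Source": "github.com", "Points": "345 points", "By": "someuser",
#    "Age": "5 hours ago", "Comments": "123 comments"}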


# ===========================
# Public function for Gradio
# ===========================

def extract_page(url: str, full_text: bool, max_links: int, content_links_only: bool) -> Tuple[str, List[Dict]]:
    """
    Main function wired to the UI.
    - Fetches the page
    - If it's Hacker News front page, parse posts specially
    - Otherwise: extract metadata, main text (optional), and links
    - Returns Markdown (summary) + a table of links
    """
    try:
        html = fetch_html(url)
    except requests.exceptions.RequestException as e:
        # Friendly error message for the UI textbox
        return f"## Error\nUnable to fetch the page.\n\n**Details:** {e}", []

    # Hacker News special handling for top-notch usability
    if is_hn_front(url):
        md, items = parse_hn_front(html, url)
        return md, items  # For HN, the table is the rich story list

    # Generic page pipeline
    soup_full = BeautifulSoup(html, "lxml")                # Full page soup for metadata and optional link fallback
    metadata = extract_metadata(soup_full, url)            # Title, canonical, description, etc.
    main_text, summary_soup = extract_main_text(html)      # Readability content (may be empty on index pages)

    # Choose where we harvest links from
    links = collect_links(summary_soup, url, content_links_only, html)
    max_links = int(max_links or 0)  # Slider values may arrive as floats; normalize once
    if max_links > 0:
        links = links[:max_links]

    # Build a readable Markdown summary
    md_lines = []

    # Title line (prefer metadata title)
    title_to_show = metadata.title or "(Untitled)"
    md_lines.append(f"# {title_to_show}")

    # Canonical + URL info
    if metadata.canonical_url and metadata.canonical_url != url:
        md_lines.append(f"- **Canonical:** {metadata.canonical_url}")
    md_lines.append(f"- **URL:** {url}")

    # Optional metadata lines
    if metadata.site_name:
        md_lines.append(f"- **Site:** {metadata.site_name}")
    if metadata.description:
        md_lines.append(f"- **Description:** {metadata.description}")
    if metadata.published_time:
        md_lines.append(f"- **Published:** {metadata.published_time}")
    if metadata.og_type:
        md_lines.append(f"- **OG Type:** {metadata.og_type}")

    # Spacer
    md_lines.append("\n---\n")

    # Main content (optional, controlled by checkbox)
    if full_text and main_text:
        md_lines.append("## Main Content")
        # Keep things readable; long pages can be huge—Readability already helps keep it topical
        md_lines.append(main_text)
        md_lines.append("\n---\n")

    # Links brief (we also return a structured table below)
    md_lines.append("## Links Found")
    md_lines.append(
        f"Showing {'content-only' if content_links_only else 'all-page'} links (up to {max_links}). "
        "Click any to open in a new tab."
    )

    md = "\n".join(md_lines)
    return md, links
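
# Quick manual test sketch (assumes network access; not run automatically):
#   md, rows = extract_page("https://example.com", full_text=False, max_links=50, content_links_only=True)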


# ===========
# Gradio UI
# ===========

# Build a Blocks UI so we can have multiple outputs (Markdown + DataFrame) nicely arranged
with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP — Clean Text & Links") as demo:
    # --- Header area: title + quick helper buttons
    gr.Markdown("# Fetch MCP — Clean Text & Links\n"
                "Extract clean **title**, **metadata**, **main text**, and **hyperlinks** from any page.\n\n"
                "- Special handling for **Hacker News** front page (rank, points, comments).\n"
                "- Toggle **Full Text** if you also want the extracted article content.")

    with gr.Row():
        url_in = gr.Textbox(
            label="URL",
            placeholder="https://news.ycombinator.com/  •  https://example.com/article",
            value="https://news.ycombinator.com/",
            scale=4
        )
        fetch_btn = gr.Button("Fetch / Extract", variant="primary", scale=1)

    with gr.Row():
        full_text_chk = gr.Checkbox(
            label="Include main content text (Readability extract)?",
            value=False
        )
        content_only_chk = gr.Checkbox(
            label="Links from main content only (fallback: full page)?",
            value=True
        )
        max_links_sld = gr.Slider(
            label="Max links to return",
            minimum=10, maximum=500, value=100, step=10
        )

    # Outputs: Markdown summary + a table of links (or HN posts table)
    summary_md = gr.Markdown(label="Summary")
    links_tbl = gr.Dataframe(
        headers=["Rank/—", "Title/Text", "URL", "Source/Domain", "Points", "By", "Age", "Comments"],
        # We won't pre-enforce headers strictly; DataFrame will adapt to dict keys provided.
        interactive=False,
        wrap=True,
        row_count=(0, "dynamic"),
        col_count=(0, "dynamic")
    )

    # Wire up the action: clicking the button runs extract_page and shows results
    fetch_btn.click(
        fn=extract_page,
        inputs=[url_in, full_text_chk, max_links_sld, content_only_chk],
        outputs=[summary_md, links_tbl]
    )

# Keep MCP server behavior enabled for your setup
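# Note: mcp_server=True assumes a Gradio build with MCP support (typically installed
# via `pip install "gradio[mcp]"`); a plain demo.launch() works without it.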
if __name__ == "__main__":
    demo.launch(mcp_server=True)