# File: main/app.py
# Purpose: Fetch only the "relevant" page content (title, key metadata, clean body text, and hyperlinks)
#          instead of returning full HTML. Output is compact and configurable to reduce verbosity.
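# Dependencies (assumed package names): gradio, requests, beautifulsoup4, lxml,
# and readability-lxml (provides `from readability import Document`).
#   pip install gradio requests beautifulsoup4 lxml readability-lxml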

import gradio as gr                        # UI library
import requests                            # HTTP client
from bs4 import BeautifulSoup              # HTML parsing
from readability import Document           # Readability algorithm to isolate main content
from urllib.parse import urljoin, urldefrag, urlparse  # URL helpers
import re                                  # For whitespace cleanup and simple formatting


# -------------------------------
# HTTP fetching with sane defaults
# -------------------------------
def _http_get(url: str) -> requests.Response:
    """
    Make an HTTP GET request with headers and a timeout.
    Layman's terms: downloads the webpage safely and politely.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; NymboFetcher/1.0; +https://example.com)",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    # Short timeouts so the app isn't stuck forever
    return requests.get(url, headers=headers, timeout=15)
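
# Example (illustrative): callers pair this with raise_for_status(), as in
# extract_relevant() below:
#   resp = _http_get("https://example.com")
#   resp.raise_for_status()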


# ----------------------------------------
# Helpers: text cleanup & friendly trimming
# ----------------------------------------
def _normalize_whitespace(text: str) -> str:
    """
    Layman's terms: squash weird spacing and too many blank lines.
    """
    text = re.sub(r"[ \t\u00A0]+", " ", text)               # collapse runs of spaces
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())   # max 1 blank line at a time
    return text.strip()
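
# Example (illustrative):
#   _normalize_whitespace("Hello   world\n\n\n\nBye")  ->  "Hello world\n\nBye"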


def _truncate(text: str, max_chars: int) -> tuple[str, bool]:
    """
    Layman's terms: cut the text if it’s too long and tell the caller if we cut it.
    """
    if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
        return text, False
    return text[:max_chars].rstrip() + " …", True
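
# Examples (illustrative):
#   _truncate("hello world", 5)  ->  ("hello …", True)
#   _truncate("short", 100)      ->  ("short", False)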


def _domain_of(url: str) -> str:
    """
    Layman's terms: show a friendly domain like example.com.
    """
    try:
        return urlparse(url).netloc or ""
    except Exception:
        return ""


# -----------------------------------
# Metadata extraction (title, etc.)
# -----------------------------------
def _extract_metadata(soup: BeautifulSoup, final_url: str) -> dict:
    """
    Layman's terms: grab useful fields like title, description, site name, and canonical link.
    """
    meta = {}

    # Title preference: <title> > og:title > twitter:title
    title_candidates = [
        (soup.title.string if soup.title and soup.title.string else None),
        _og(soup, "og:title"),
        _meta(soup, "twitter:title"),
    ]
    meta["title"] = next((t.strip() for t in title_candidates if t and t.strip()), "")

    # Description preference: meta[name=description] > og:description > twitter:description
    desc_candidates = [
        _meta(soup, "description"),
        _og(soup, "og:description"),
        _meta(soup, "twitter:description"),
    ]
    meta["description"] = next((d.strip() for d in desc_candidates if d and d.strip()), "")

    # Canonical URL if provided (helps dedupe / standardize)
    link_canonical = soup.find("link", rel=lambda v: v and "canonical" in v)
    meta["canonical"] = (link_canonical.get("href") or "").strip() if link_canonical else ""

    # Site name (nice for context)
    meta["site_name"] = (_og(soup, "og:site_name") or "").strip()

    # Language (if present)
    html_tag = soup.find("html")
    meta["lang"] = (html_tag.get("lang") or "").strip() if html_tag else ""

    # Final resolved URL and domain
    meta["fetched_url"] = final_url
    meta["domain"] = _domain_of(final_url)

    return meta
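
# For reference, the returned dict always contains these string keys (empty string
# when a field is missing): "title", "description", "canonical", "site_name",
# "lang", "fetched_url", "domain".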


def _meta(soup: BeautifulSoup, name: str) -> str | None:
    tag = soup.find("meta", attrs={"name": name})
    return tag.get("content") if tag and tag.has_attr("content") else None


def _og(soup: BeautifulSoup, prop: str) -> str | None:
    tag = soup.find("meta", attrs={"property": prop})
    return tag.get("content") if tag and tag.has_attr("content") else None


# ---------------------------------------------------------
# Main content extraction with Readability + gentle cleanup
# ---------------------------------------------------------
def _extract_main_text(html: str) -> tuple[str, BeautifulSoup]:
    """
    Layman's terms: use Readability to find the article body, then clean it to plain text.
    Returns (clean_text, soup_of_readable_html) for link scraping.
    """
    # Readability gives us a simplified article HTML
    doc = Document(html)
    readable_html = doc.summary(html_partial=True)

    # Parse the simplified HTML so we can clean it up further
    s = BeautifulSoup(readable_html, "lxml")

    # Remove obviously noisy elements if present
    for sel in ["script", "style", "noscript", "iframe", "svg"]:
        for tag in s.select(sel):
            tag.decompose()

    # Extract text with paragraphs preserved, then normalize whitespace
    text_parts = []
    for p in s.find_all(["p", "li", "h2", "h3", "h4", "blockquote"]):
        # Keep list items and headers to retain structure without being too verbose
        chunk = p.get_text(" ", strip=True)
        if chunk:
            text_parts.append(chunk)

    clean_text = _normalize_whitespace("\n\n".join(text_parts))
    return clean_text, s
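
# Usage sketch: the returned soup is reused downstream so that links are pulled
# from the article body only, not the whole page, e.g.:
#   body_text, readable_soup = _extract_main_text(html)
#   links = _extract_links(readable_soup, final_url, max_links=20)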


# ------------------------------------------
# Link extraction from the simplified content
# ------------------------------------------
def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> list[tuple[str, str]]:
    """
    Layman's terms: pull out clickable links from the article content only,
    turn them into absolute URLs, drop junk, dedupe, and cap the list.
    """
    seen = set()
    links: list[tuple[str, str]] = []

    for a in readable_soup.find_all("a", href=True):
        href = a.get("href").strip()
        # Ignore anchors, mailto, javascript, and empty
        if not href or href.startswith("#") or href.startswith("mailto:") or href.startswith("javascript:"):
            continue

        # Resolve relative URLs and strip URL fragments (#section)
        absolute = urljoin(base_url, href)
        absolute, _ = urldefrag(absolute)

        if absolute in seen:
            continue
        seen.add(absolute)

        text = a.get_text(" ", strip=True)
        # Keep link text concise
        if len(text) > 120:
            text = text[:117] + "…"

        links.append((text or absolute, absolute))

        if len(links) >= max_links > 0:
            break

    return links
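
# Example (illustrative), with base_url="https://example.com/a/":
#   <a href="../b#sec">B</a>  ->  ("B", "https://example.com/b")
# Anchors, mailto:, and javascript: links are skipped; duplicates are dropped.
# Note: max_links <= 0 means "no cap" rather than "no links".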


# -------------------------
# Formatter: compact output
# -------------------------
def _format_markdown(meta: dict, body: str, body_truncated: bool, links: list[tuple[str, str]],
                     include_text: bool, include_metadata: bool, include_links: bool, verbosity: str) -> str:
    """
    Layman's terms: turn the pieces into a neat, compact Markdown string.
    """
    lines = []

    # Title header
    title = meta.get("title") or meta.get("domain") or "Untitled"
    lines.append(f"# {title}")

    # Metadata (compact)
    if include_metadata:
        md = []
        # Only show fields that exist to keep things tight
        if meta.get("description"):
            md.append(f"- **Description:** {meta['description']}")
        if meta.get("site_name"):
            md.append(f"- **Site:** {meta['site_name']}")
        if meta.get("canonical"):
            md.append(f"- **Canonical:** {meta['canonical']}")
        if meta.get("lang"):
            md.append(f"- **Language:** {meta['lang']}")
        if meta.get("fetched_url"):
            md.append(f"- **Fetched From:** {meta['fetched_url']}")

        if md:
            lines.append("## Metadata")
            lines.extend(md)

    # Body text
    if include_text and body:
        # For "Brief", show a very small excerpt even after truncation
        if verbosity == "Brief":
            brief, was_more = _truncate(body, 800)
            lines.append("## Text")
            lines.append(brief)
            if was_more or body_truncated:
                lines.append("\n> (Trimmed for brevity)")
        else:
            lines.append("## Text")
            lines.append(body)
            if body_truncated:
                lines.append("\n> (Trimmed for brevity)")

    # Links
    if include_links and links:
        lines.append(f"## Links ({len(links)})")
        for text, url in links:
            lines.append(f"- [{text}]({url})")

    return "\n\n".join(lines).strip()


# --------------------------------
# Gradio-facing function (the app)
# --------------------------------
def extract_relevant(
    url: str,
    verbosity: str = "Standard",
    include_metadata: bool = True,
    include_text: bool = True,
    include_links: bool = True,
    max_chars: int = 3000,
    max_links: int = 20
) -> str:
    """
    Given a URL, fetch the page, extract just the good stuff, and return a compact Markdown summary.
    """
    if not url or not url.strip():
        return "Please enter a valid URL."

    try:
        resp = _http_get(url)
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"

    # Respect the final resolved URL (after redirects)
    final_url = str(resp.url)

    # Only process HTML-ish responses
    ctype = resp.headers.get("Content-Type", "")
    if "html" not in ctype.lower():
        return f"Unsupported content type for extraction: {ctype or 'unknown'}"

    # Decode as text (requests usually sets encoding; otherwise guess)
    resp.encoding = resp.encoding or resp.apparent_encoding
    html = resp.text

    # Full page soup (to extract metadata accurately)
    full_soup = BeautifulSoup(html, "lxml")
    meta = _extract_metadata(full_soup, final_url)

    # Extract main body text using Readability
    body_text, readable_soup = _extract_main_text(html)

    # If the body is suspiciously empty, fall back to a simpler text strategy
    if not body_text:
        fallback_text = full_soup.get_text(" ", strip=True)
        body_text = _normalize_whitespace(fallback_text)

    # Apply the verbosity preset cap; the slider can tighten it but never raise it
    preset_caps = {"Brief": 1200, "Standard": 3000, "Full": 999999}
    target_cap = preset_caps.get(verbosity, 3000)
    # Use the *smaller* of user cap and preset to keep things tidy
    cap = min(max_chars if max_chars > 0 else target_cap, target_cap)
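    # Worked examples: Brief preset (1200) with slider 3000 -> cap 1200;
    # Standard preset (3000) with slider 800 -> cap 800.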
    body_text, truncated = _truncate(body_text, cap) if include_text else ("", False)

    # Extract links from the readable portion only (cleaner than whole DOM)
    links = _extract_links(readable_soup, final_url, max_links=max_links if include_links else 0)

    # Build compact Markdown
    md = _format_markdown(
        meta=meta,
        body=body_text,
        body_truncated=truncated,
        links=links,
        include_text=include_text,
        include_metadata=include_metadata,
        include_links=include_links,
        verbosity=verbosity
    )
    return md or "No content could be extracted."
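
# Programmatic usage (sketch, bypassing the UI):
#   md = extract_relevant("https://example.com/article", verbosity="Brief",
#                         max_chars=1200, max_links=10)
#   print(md)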


# -----------------
# Gradio UI (Blocks)
# -----------------
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    # Title & subtitle for clarity
    gr.Markdown("# Fetch MCP — Clean Extract")
    gr.Markdown(
        "Extract **title**, **metadata**, **clean text**, and **links** — without the noisy HTML. "
        "Use Verbosity and caps to keep it tight."
    )

    with gr.Row():
        url_in = gr.Textbox(label="URL", placeholder="https://example.com/article")
        fetch_btn = gr.Button("Fetch Clean Content")

    with gr.Accordion("Options", open=False):
        with gr.Row():
            verbosity = gr.Dropdown(
                label="Verbosity",
                choices=["Brief", "Standard", "Full"],
                value="Standard",
                info="Controls how much text you get back."
            )
            max_chars = gr.Slider(
                400, 12000, value=3000, step=100,
                label="Max Characters (body text)",
                info="Hard cap for body text. Lower = less verbose."
            )
            max_links = gr.Slider(
                0, 100, value=20, step=1,
                label="Max Links",
                info="Limit how many hyperlinks we include."
            )
        with gr.Row():
            include_metadata = gr.Checkbox(value=True, label="Include Metadata")
            include_text = gr.Checkbox(value=True, label="Include Main Text")
            include_links = gr.Checkbox(value=True, label="Include Links")

    # Output as Markdown (compact and readable)
    out = gr.Markdown(label="Result")

    # Wire up the click
    fetch_btn.click(
        fn=extract_relevant,
        inputs=[url_in, verbosity, include_metadata, include_text, include_links, max_chars, max_links],
        outputs=out
    )

# Keep MCP server enabled
if __name__ == "__main__":
    demo.launch(mcp_server=True)