# File: app.py
# Purpose: Fetch only the readable text from a web page and return it as Markdown
# Notes: This version is more efficient and user-friendly than returning raw HTML.

import re
import time
import gradio as gr
import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup  # used as a fallback cleaner
from readability import Document  # isolates the "main content" like reader view
import html2text  # converts HTML to Markdown

# ----------------------------
# Simple in-memory cache (tiny LRU-ish)
# ----------------------------
# layman's terms: we remember recent results so repeated requests for the same URL are instant
_CACHE = {}
_CACHE_ORDER = []
_CACHE_MAX = 64
_CACHE_TTL_SECONDS = 10 * 60  # 10 minutes

def _cache_get(key):
    # layman's terms: give me the saved value if it's still fresh
    item = _CACHE.get(key)
    if not item:
        return None
    value, ts = item
    if time.time() - ts > _CACHE_TTL_SECONDS:
        _CACHE.pop(key, None)
        return None
    # refresh order
    if key in _CACHE_ORDER:
        _CACHE_ORDER.remove(key)
    _CACHE_ORDER.append(key)
    return value

def _cache_set(key, value):
    # layman's terms: save a result and keep the list from growing too large
    _CACHE[key] = (value, time.time())
    if key in _CACHE_ORDER:
        _CACHE_ORDER.remove(key)
    _CACHE_ORDER.append(key)
    while len(_CACHE_ORDER) > _CACHE_MAX:
        oldest = _CACHE_ORDER.pop(0)
        _CACHE.pop(oldest, None)
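# layman's terms: an illustrative walk-through of the cache (not executed here;
# the names are the real ones above, the values are made up):
#   _cache_set("https://a.example", "# A")      stores the Markdown with a timestamp
#   _cache_get("https://a.example")  -> "# A"   and bumps the key to most-recent
#   ...after _CACHE_TTL_SECONDS pass, the same get returns None (entry expired)
#   ...once more than _CACHE_MAX distinct keys are stored, the least recently
#      used key is evicted from both _CACHE and _CACHE_ORDER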

# ----------------------------
# Helpers
# ----------------------------

def _normalize_url(url: str) -> str:
    """
    layman's terms: if the user forgot 'https://', add it.
    """
    url = url.strip()
    parsed = urlparse(url)
    if not parsed.scheme:
        url = "https://" + url
    return url
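# layman's terms: illustrative examples (made-up hosts):
#   _normalize_url("example.com")            -> "https://example.com"
#   _normalize_url("  http://example.com  ") -> "http://example.com"  (kept as-is, just trimmed)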

def _too_large_via_head(url: str, max_bytes: int = 2_500_000) -> bool:
    """
    layman's terms: do a quick HEAD request; if the server says the page is huge, we skip it.
    """
    try:
        head = requests.head(
            url,
            allow_redirects=True,
            timeout=(5, 10),
            headers={
                "User-Agent": "Mozilla/5.0",
                "Accept": "text/html,application/xhtml+xml",
                "Accept-Encoding": "gzip, deflate, br",
            },
        )
        size = head.headers.get("Content-Length")
        if size and size.isdigit():
            return int(size) > max_bytes
    except requests.exceptions.RequestException:
        # layman's terms: if HEAD fails, we won't block the GET just because of that
        pass
    return False
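# layman's terms: illustrative behavior:
#   server reports Content-Length: 5000000  -> True  (skip the download)
#   no Content-Length, or the HEAD fails    -> False (fall through to the GET)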

def _fetch_html(url: str) -> tuple[str, bool]:
    """
    layman's terms: download the page body (not images/scripts), with a timeout and errors handled.
    Returns the text plus a flag telling the caller whether the payload was actually HTML.
    """
    resp = requests.get(
        url,
        timeout=(5, 20),  # connect, read
        headers={
            "User-Agent": "Mozilla/5.0",
            "Accept": "text/html,application/xhtml+xml",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.8",
        },
    )
    resp.raise_for_status()

    # Respect declared encoding where possible
    resp.encoding = resp.encoding or "utf-8"

    # layman's terms: if it's not a web page (maybe JSON/PDF/etc), flag it so the
    # caller can show it verbatim instead of running it through the article extractor
    ctype = resp.headers.get("Content-Type", "")
    is_html = "text/html" in ctype.lower()
    return resp.text, is_html

def _extract_main_html(html: str) -> str:
    """
    layman's terms: use reader mode (Readability) to isolate the main article/body content.
    Falls back to stripping scripts/styles if Readability can't find the core content.
    """
    try:
        doc = Document(html)
        main_html = doc.summary(html_partial=True)  # main content as HTML
        # Make sure we still have something useful
        if main_html and len(main_html) > 40:
            return main_html
    except Exception:
        pass

    # Fallback: strip scripts/styles and return a body-only HTML
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    body = soup.body or soup
    return str(body)
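# layman's terms: illustrative behavior:
#   a news article page -> just the article HTML (nav/ads dropped by Readability)
#   a page Readability can't parse -> the <body> with scripts/styles stripped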

def _html_to_markdown(html: str) -> str:
    """
    layman's terms: convert the cleaned HTML into nice Markdown with links and headings.
    """
    h = html2text.HTML2Text()
    h.ignore_images = True          # don't inline images in Markdown
    h.ignore_links = False          # keep links as [text](url)
    h.body_width = 0                # don't hard-wrap lines
    h.protect_links = True
    h.single_line_break = True
    md = h.handle(html)

    # Tidy up excessive blank lines/whitespace
    md = re.sub(r"\n{3,}", "\n\n", md).strip()
    return md or "_No readable text found on this page._"
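# layman's terms: an illustrative conversion (output is approximate):
#   '<h1>Title</h1><p>See <a href="https://example.com">this</a>.</p>'
#   -> '# Title\nSee [this](https://example.com).'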

# ----------------------------
# Main callable for Gradio
# ----------------------------

def fetch_markdown(url: str) -> str:
    """
    layman's terms: the function the UI calls.
    Steps:
      1) sanitize the URL
      2) return the cached result if we have a fresh one
      3) quick HEAD check to avoid massive pages
      4) GET the page (non-HTML payloads get code-fenced verbatim)
      5) isolate the main content and convert it to Markdown
      6) cache and return the Markdown
    """
    if not url or not url.strip():
        return "_Please enter a URL._"

    try:
        url = _normalize_url(url)

        # Return cached value if available
        cached = _cache_get(url)
        if cached:
            return cached

        # Optional efficiency: skip very large pages before downloading
        if _too_large_via_head(url):
            return "_The page is too large to fetch efficiently (over ~2.5 MB)._"

        html, is_html = _fetch_html(url)

        # If the server returned non-HTML (e.g., JSON), just code-fence it verbatim
        if not is_html:
            markdown = f"```\n{html.strip()}\n```"
        else:
            main_html = _extract_main_html(html)
            markdown = _html_to_markdown(main_html)

        _cache_set(url, markdown)
        return markdown

    except requests.exceptions.RequestException as e:
        # layman's terms: network or HTTP error
        return f"_Network error: {e}_"
    except Exception as e:
        # layman's terms: any other unexpected error
        return f"_Unexpected error: {e}_"

# ----------------------------
# Gradio UI
# ----------------------------
with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP — Markdown") as demo:
    # layman's terms: a simple, centered header explaining what this tool does
    gr.Markdown("# Fetch MCP (Markdown)\nFetch a page and show just its readable text as Markdown.")

    with gr.Row():
        url_box = gr.Textbox(
            label="URL",
            placeholder="example.com or https://example.com/article",
        )
        fetch_btn = gr.Button("Fetch")

    # layman's terms: show the result as rendered Markdown (not a plain textbox)
    output_md = gr.Markdown(label="Readable Markdown")

    # layman's terms: helpful example URLs to try with one click
    gr.Examples(
        examples=[
            ["https://en.wikipedia.org/wiki/Hugging_Face"],
            ["https://huggingface.co/blog"],
            ["https://www.bbc.com/news"],
        ],
        inputs=[url_box],
    )

    fetch_btn.click(fetch_markdown, inputs=url_box, outputs=output_md)
    url_box.submit(fetch_markdown, inputs=url_box, outputs=output_md)

if __name__ == "__main__":
    demo.launch(mcp_server=True)