import re
import time
from urllib.parse import urlparse

import gradio as gr
import html2text
import requests
from bs4 import BeautifulSoup
from readability import Document


# Small in-process cache: LRU order lives in _CACHE_ORDER and entries
# expire after a TTL, so repeat fetches are fast but never too stale.
_CACHE = {}
_CACHE_ORDER = []
_CACHE_MAX = 64
_CACHE_TTL_SECONDS = 10 * 60


def _cache_get(key):
    item = _CACHE.get(key)
    if not item:
        return None
    value, ts = item
    if time.time() - ts > _CACHE_TTL_SECONDS:
        # Expired: drop the entry and its slot in the LRU order.
        _CACHE.pop(key, None)
        if key in _CACHE_ORDER:
            _CACHE_ORDER.remove(key)
        return None
    # Cache hit: move the key to the most-recently-used position.
    if key in _CACHE_ORDER:
        _CACHE_ORDER.remove(key)
    _CACHE_ORDER.append(key)
    return value


def _cache_set(key, value):
    _CACHE[key] = (value, time.time())
    if key in _CACHE_ORDER:
        _CACHE_ORDER.remove(key)
    _CACHE_ORDER.append(key)
    # Evict least-recently-used entries once the cache is over capacity.
    while len(_CACHE_ORDER) > _CACHE_MAX:
        oldest = _CACHE_ORDER.pop(0)
        _CACHE.pop(oldest, None)
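
# A rough usage sketch of the helpers above (hypothetical key/value, not
# part of the app flow): _cache_set("https://example.com", "# Example")
# stores a rendered page; _cache_get("https://example.com") returns it for
# the next 10 minutes and bumps the key to most-recently-used; after the
# TTL the same lookup returns None and the entry is dropped.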


def _normalize_url(url: str) -> str:
    """
    layman's terms: if the user forgot 'https://', add it.
    """
    url = url.strip()
    parsed = urlparse(url)
    if not parsed.scheme:
        url = "https://" + url
    return url
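
# For example: _normalize_url("example.com") -> "https://example.com",
# while an already-schemed URL such as "https://example.com/article"
# passes through unchanged (apart from stripped whitespace).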


def _too_large_via_head(url: str, max_bytes: int = 2_500_000) -> bool:
    """
    layman's terms: do a quick HEAD request; if the server says the page is huge, we skip it.
    """
    try:
        head = requests.head(
            url,
            allow_redirects=True,
            timeout=(5, 10),
            headers={
                "User-Agent": "Mozilla/5.0",
                "Accept": "text/html,application/xhtml+xml",
                "Accept-Encoding": "gzip, deflate, br",
            },
        )
        size = head.headers.get("Content-Length")
        if size and size.isdigit():
            return int(size) > max_bytes
    except requests.exceptions.RequestException:
        # A failed HEAD is not fatal; the GET may still succeed.
        pass
    return False
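
# In effect: a response advertising "Content-Length: 5000000" returns True
# (skip the fetch), while a missing or non-numeric Content-Length (common
# with chunked responses) falls through to False, so the page is still
# fetched and the size limit is simply not enforced for it.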


def _fetch_html(url: str) -> str:
    """
    layman's terms: download the page HTML (not images/scripts), with a timeout and errors handled.
    """
    resp = requests.get(
        url,
        timeout=(5, 20),
        headers={
            "User-Agent": "Mozilla/5.0",
            "Accept": "text/html,application/xhtml+xml",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.8",
        },
    )
    resp.raise_for_status()

    # Non-HTML responses (plain text, JSON, etc.) are returned as-is; the
    # extractor downstream degrades gracefully on them.
    ctype = resp.headers.get("Content-Type", "")
    if "text/html" not in ctype.lower():
        return resp.text

    # Fall back to UTF-8 when the server does not declare an encoding.
    resp.encoding = resp.encoding or "utf-8"
    return resp.text
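
# Note: in requests, timeout=(5, 20) is a (connect, read) pair: up to 5 s
# to establish the connection and 20 s between bytes of the body, so a
# dead host fails fast while a slow article can still finish loading.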


def _extract_main_html(html: str) -> str:
    """
    layman's terms: use reader mode (Readability) to isolate the main article/body content.
    Falls back to stripping scripts/styles if Readability can't find a core.
    """
    try:
        doc = Document(html)
        main_html = doc.summary(html_partial=True)
        # Treat a near-empty summary as a failed extraction.
        if main_html and len(main_html) > 40:
            return main_html
    except Exception:
        pass

    # Fallback: keep the whole <body> but drop script/style/noscript tags.
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    body = soup.body or soup
    return str(body)
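
# Document.summary(html_partial=True) returns just the inner fragment of
# the detected article (no wrapping <html>/<body>), which suits html2text
# below; on pages with no clear article (home pages, link lists) the
# BeautifulSoup fallback keeps everything readable instead.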


def _html_to_markdown(html: str) -> str:
    """
    layman's terms: convert the cleaned HTML into nice Markdown with links and headings.
    """
    h = html2text.HTML2Text()
    h.ignore_images = True
    h.ignore_links = False
    h.body_width = 0  # 0 disables hard line-wrapping
    h.protect_links = True  # keep long URLs on one line
    h.single_line_break = True
    md = h.handle(html)

    # Collapse runs of three or more newlines left over from conversion.
    md = re.sub(r"\n{3,}", "\n\n", md).strip()
    return md or "_No readable text found on this page._"
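
# A small illustration (hypothetical input): feeding
# "<h1>Hi</h1><p>See <a href='https://x.example'>this</a>.</p>" through
# this function yields roughly "# Hi" followed by
# "See [this](https://x.example).", with images dropped and links inline.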


def fetch_markdown(url: str) -> str:
    """
    layman's terms: the function the UI calls.
    Steps:
    1) sanitize the URL
    2) quick HEAD check to avoid massive pages
    3) GET the HTML
    4) isolate the main content
    5) convert to Markdown
    6) return Markdown
    """
    if not url or not url.strip():
        return "_Please enter a URL._"

    try:
        url = _normalize_url(url)

        # Serve from cache when a fresh copy exists.
        cached = _cache_get(url)
        if cached:
            return cached

        # Bail out early on pages the server reports as very large.
        if _too_large_via_head(url):
            return "_The page is too large to fetch efficiently (over ~2.5 MB)._"

        html = _fetch_html(url)
        main_html = _extract_main_html(html)
        markdown = _html_to_markdown(main_html)

        _cache_set(url, markdown)
        return markdown

    except requests.exceptions.RequestException as e:
        return f"_Network error: {e}_"
    except Exception as e:
        return f"_Unexpected error: {e}_"


with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP — Markdown") as demo:
    gr.Markdown("# Fetch MCP (Markdown)\nFetch a page and show just its readable text as Markdown.")

    with gr.Row():
        url_box = gr.Textbox(
            label="URL",
            placeholder="example.com or https://example.com/article",
        )
        fetch_btn = gr.Button("Fetch")

    output_md = gr.Markdown(label="Readable Markdown")

    gr.Examples(
        examples=[
            ["https://en.wikipedia.org/wiki/Hugging_Face"],
            ["https://huggingface.co/blog"],
            ["https://www.bbc.com/news"],
        ],
        inputs=[url_box],
    )

    fetch_btn.click(fetch_markdown, inputs=url_box, outputs=output_md)
    url_box.submit(fetch_markdown, inputs=url_box, outputs=output_md)
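
# Launching with mcp_server=True asks Gradio to expose the app's function
# (fetch_markdown, described by its docstring) as an MCP tool alongside the
# web UI; this assumes a Gradio build with MCP support. Plain demo.launch()
# would still serve the UI without the MCP endpoint.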


if __name__ == "__main__":
    demo.launch(mcp_server=True)