# File: app.py
# Purpose: Fetch only the readable text from a web page and return it as Markdown
# Notes: This version is more efficient and user-friendly than returning raw HTML.
import re
import time
import gradio as gr
import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup # used as a fallback cleaner
from readability import Document # isolates the "main content" like reader view
import html2text # converts HTML to Markdown
# ----------------------------
# Simple in-memory cache (tiny LRU-ish)
# ----------------------------
# layman's terms: we remember recent results so repeated requests for the same URL are instant
_CACHE = {}
_CACHE_ORDER = []
_CACHE_MAX = 64
_CACHE_TTL_SECONDS = 10 * 60 # 10 minutes
def _cache_get(key):
    # layman's terms: give me the saved value if it's still fresh
    item = _CACHE.get(key)
    if not item:
        return None
    value, ts = item
    if time.time() - ts > _CACHE_TTL_SECONDS:
        _CACHE.pop(key, None)
        return None
    # refresh order
    if key in _CACHE_ORDER:
        _CACHE_ORDER.remove(key)
    _CACHE_ORDER.append(key)
    return value
def _cache_set(key, value):
    # layman's terms: save a result and keep the list from growing too large
    _CACHE[key] = (value, time.time())
    if key in _CACHE_ORDER:
        _CACHE_ORDER.remove(key)
    _CACHE_ORDER.append(key)
    while len(_CACHE_ORDER) > _CACHE_MAX:
        oldest = _CACHE_ORDER.pop(0)
        _CACHE.pop(oldest, None)
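# Illustrative cache behaviour (a minimal sketch, not executed here): repeating a
# lookup within the 10-minute TTL returns the stored Markdown without refetching.
#
#   _cache_set("https://example.com", "# Example page")
#   _cache_get("https://example.com")        # -> "# Example page" (fresh hit)
#   _cache_get("https://never-stored.test")  # -> None (miss)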
# ----------------------------
# Helpers
# ----------------------------
def _normalize_url(url: str) -> str:
"""
layman's terms: if the user forgot 'https://', add it.
"""
url = url.strip()
parsed = urlparse(url)
if not parsed.scheme:
url = "https://" + url
return url
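# Illustrative examples (not executed here): urlparse reports an empty scheme for a
# bare domain, so "https://" is prepended; full URLs pass through with whitespace stripped.
#
#   _normalize_url("example.com/article")        # -> "https://example.com/article"
#   _normalize_url("  https://example.com/a  ")  # -> "https://example.com/a"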
def _too_large_via_head(url: str, max_bytes: int = 2_500_000) -> bool:
"""
layman's terms: do a quick HEAD request; if the server says the page is huge, we skip it.
"""
try:
head = requests.head(
url,
allow_redirects=True,
timeout=(5, 10),
headers={
"User-Agent": "Mozilla/5.0",
"Accept": "text/html,application/xhtml+xml",
"Accept-Encoding": "gzip, deflate, br",
},
)
size = head.headers.get("Content-Length")
if size and size.isdigit():
return int(size) > max_bytes
except requests.exceptions.RequestException:
# layman's terms: if HEAD fails, we won't block the GET just because of that
pass
return False
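# Illustrative behaviour (a sketch, not executed here): the check is conservative, so a
# page only gets skipped when the server actually declares a large size up front.
#
#   Content-Length: 5000000   -> returns True  (5 MB > 2.5 MB cap, page is skipped)
#   Content-Length missing    -> returns False (we fall through to the normal GET)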
def _fetch_html(url: str) -> str:
"""
layman's terms: download the page HTML (not images/scripts), with a timeout and errors handled.
"""
resp = requests.get(
url,
timeout=(5, 20), # connect, read
headers={
"User-Agent": "Mozilla/5.0",
"Accept": "text/html,application/xhtml+xml",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.8",
},
)
resp.raise_for_status()
# Only proceed for text/html payloads
ctype = resp.headers.get("Content-Type", "")
if "text/html" not in ctype.lower():
# layman's terms: if it's not a web page (maybe JSON/PDF/etc), just give raw text
return resp.text
# Respect declared encoding where possible
resp.encoding = resp.encoding or "utf-8"
return resp.text
def _extract_main_html(html: str) -> str:
"""
layman's terms: use reader mode (Readability) to isolate the main article/body content.
Falls back to stripping scripts/styles if Readability can't find a core.
"""
try:
doc = Document(html)
main_html = doc.summary(html_partial=True) # main content as HTML
# Make sure we still have something useful
if main_html and len(main_html) > 40:
return main_html
except Exception:
pass
# Fallback: strip scripts/styles and return a body-only HTML
soup = BeautifulSoup(html, "html.parser")
for tag in soup(["script", "style", "noscript"]):
tag.decompose()
body = soup.body or soup
return str(body)
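# Illustrative example (assuming Readability can't isolate an article in such a tiny
# snippet and the BeautifulSoup fallback runs): scripts are dropped, the body survives.
#
#   _extract_main_html("<html><body><script>x()</script><p>Hello</p></body></html>")
#   # -> roughly "<body><p>Hello</p></body>"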
def _html_to_markdown(html: str) -> str:
"""
layman's terms: convert the cleaned HTML into nice Markdown with links and headings.
"""
h = html2text.HTML2Text()
h.ignore_images = True # don't inline images in Markdown
h.ignore_links = False # keep links as [text](url)
h.body_width = 0 # don't hard-wrap lines
h.protect_links = True
h.single_line_break = True
md = h.handle(html)
# Tidy up excessive blank lines/whitespace
md = re.sub(r"\n{3,}", "\n\n", md).strip()
return md or "_No readable text found on this page._"
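# Illustrative conversion (a rough sketch, not executed here): headings become "#"
# lines and anchors keep their targets; exact whitespace depends on html2text.
#
#   _html_to_markdown("<h1>Title</h1><p>See <a href='https://example.com'>this</a>.</p>")
#   # -> roughly "# Title\n\nSee [this](https://example.com)."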
# ----------------------------
# Main callable for Gradio
# ----------------------------
def fetch_markdown(url: str) -> str:
"""
layman's terms: the function the UI calls.
Steps:
1) sanitize the URL
2) quick HEAD check to avoid massive pages
3) GET the HTML
4) isolate the main content
5) convert to Markdown
6) return Markdown
"""
if not url or not url.strip():
return "_Please enter a URL._"
try:
url = _normalize_url(url)
# Return cached value if available
cached = _cache_get(url)
if cached:
return cached
# Optional efficiency: skip very large pages before downloading
if _too_large_via_head(url):
return "_The page is too large to fetch efficiently (over ~2.5 MB)._"
html = _fetch_html(url)
# If server returned non-HTML (e.g., JSON), just code-fence it
if "text/html" not in (requests.utils.get_encoding_from_headers({"content-type": "text/html"}) or "text/html"):
# This condition is a no-op; we already content-typed in _fetch_html.
pass
main_html = _extract_main_html(html)
markdown = _html_to_markdown(main_html)
_cache_set(url, markdown)
return markdown
except requests.exceptions.RequestException as e:
# layman's terms: network or HTTP error
return f"_Network error: {e}_"
except Exception as e:
# layman's terms: any other unexpected error
return f"_Unexpected error: {e}_"
# ----------------------------
# Gradio UI
# ----------------------------
with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP — Markdown") as demo:
    # layman's terms: a simple header explaining what this tool does
    gr.Markdown("# Fetch MCP (Markdown)\nFetch a page and show just its readable text as Markdown.")
    with gr.Row():
        url_box = gr.Textbox(
            label="URL",
            placeholder="example.com or https://example.com/article",
        )
        fetch_btn = gr.Button("Fetch")
    # layman's terms: show the result as rendered Markdown (not a plain textbox)
    output_md = gr.Markdown(label="Readable Markdown")
    # layman's terms: helpful example URLs to try with one click
    gr.Examples(
        examples=[
            ["https://en.wikipedia.org/wiki/Hugging_Face"],
            ["https://huggingface.co/blog"],
            ["https://www.bbc.com/news"],
        ],
        inputs=[url_box],
    )
    fetch_btn.click(fetch_markdown, inputs=url_box, outputs=output_md)
    url_box.submit(fetch_markdown, inputs=url_box, outputs=output_md)
if __name__ == "__main__":
    demo.launch(mcp_server=True)