# File: app.py
# Purpose: Provide a Gradio UI that fetches a URL and (by default) returns only the
# relevant human-readable text instead of the entire HTML.
# Includes robust error handling, timeouts, and fallbacks.
import gradio as gr # UI framework
import requests # makes the web request
from bs4 import BeautifulSoup # parses HTML so we can work with it
from readability import Document # distills a page down to its "main article" content
import html  # unescapes HTML entities like &amp; → &
import re # simple cleanup with regex
# ---- helper: clean up text nicely -------------------------------------------
def _normalize_text(text: str) -> str:
"""
Layman's terms: This tidies up the text we extracted so it looks nice.
- Converts & things back to normal characters
- Collapses too many blank lines
- Trims leading/trailing whitespace
"""
text = html.unescape(text)
# Replace Windows/Mac line endings with Unix and normalize spaces
text = text.replace("\r\n", "\n").replace("\r", "\n")
# Collapse 3+ newlines down to 2
text = re.sub(r"\n{3,}", "\n\n", text)
return text.strip()
# ---- core fetcher: return main text or raw HTML ------------------------------
def fetch_page(url: str, extract_text: bool = True) -> str:
    """
    Download a web page and return its content as a string.

    Layman's terms: We download the web page. If 'extract_text' is True,
    we try to grab only the main article/important text. Otherwise we
    return the raw HTML (like the original app).

    Parameters
    ----------
    url : str
        Address to fetch. Leading/trailing whitespace is ignored.
    extract_text : bool
        When True (default), return only the main readable text;
        when False, return the raw HTML exactly as downloaded.

    Returns
    -------
    str
        Readable text, raw HTML, or a human-readable error message.
        This function never raises: every failure mode is reported as a
        string so the UI always has something to show.
    """
    # Guard: a blank URL would otherwise surface as a cryptic
    # "Invalid URL ''" error raised inside requests.
    if not url or not url.strip():
        return "Request error: please enter a URL."

    try:
        # Browser-like User-Agent (some sites reject unknown clients)
        # plus a timeout so a dead server can't hang the UI forever.
        resp = requests.get(
            url.strip(),
            headers={"User-Agent": "Mozilla/5.0 (compatible; FetchMCP/1.0)"},
            timeout=15,
            allow_redirects=True,
        )
        resp.raise_for_status()  # turn 4xx/5xx into an exception
    except requests.exceptions.RequestException as e:
        # Any network/HTTP problem is reported nicely instead of raised.
        return f"Request error: {e}"

    # If the user wants full HTML, behave like the original version.
    if not extract_text:
        return resp.text

    # Try readability first (usually best for articles/blog posts),
    # then fall back to a simpler tag-stripping pass.
    try:
        return _readability_extract(resp.text)
    except Exception:
        try:
            return _soup_extract(resp.text)
        except Exception as e:
            # Last resort: give raw HTML if even fallback parsing fails.
            return f"Extraction fallback failed: {e}\n\n--- Raw HTML below ---\n{resp.text}"


def _readability_extract(page_html: str) -> str:
    """
    Extract the main article text using readability.

    Raises ValueError when the extraction looks too short to be the real
    article (fewer than 40 words), signalling the caller to fall back.
    """
    # readability returns HTML of just the "main" content region.
    doc = Document(page_html)
    main_html = doc.summary(html_partial=True)
    soup = BeautifulSoup(main_html, "lxml")
    # Remove script/style just in case readability left any behind.
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    main_text = _normalize_text(soup.get_text(separator="\n"))
    if len(main_text.split()) < 40:
        raise ValueError("Readability extraction too short; falling back")
    return main_text


def _soup_extract(page_html: str) -> str:
    """
    Fallback extractor: strip obviously-noisy tags from the whole page
    and prefer a <main>/<article>/role="main" region when one exists.
    """
    soup = BeautifulSoup(page_html, "lxml")
    # Remove common noise: scripts, styles, nav, footer, header, forms.
    for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "form", "aside"]):
        tag.decompose()
    candidate = soup.find("main") or soup.find("article") or soup.find("div", attrs={"role": "main"})
    text = candidate.get_text(separator="\n") if candidate else soup.get_text(separator="\n")
    return _normalize_text(text)
# ---- Gradio UI ---------------------------------------------------------------
# Layman's terms: This is the app window. You paste a URL and choose whether to
# extract readable text or keep full HTML. Then click "Fetch".
# Build the app window: paste a URL, choose whether to extract readable
# text or keep full HTML, then click "Fetch".
# NOTE(review): original indentation was lost in extraction; widget nesting
# below (e.g. the button sharing a row with the toggle) should be confirmed
# against the deployed layout.
with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP") as demo:
    # Intro text rendered at the top of the page.
    gr.Markdown(
        """
        # Fetch MCP
        Small utility that fetches a web page and returns **just the readable text** by default
        *(toggle off to get the full HTML like before)*.
        """
    )
    # Row 1: the URL to fetch.
    with gr.Row():
        url_input = gr.Textbox(
            label="URL",
            placeholder="https://example.com/article",
            lines=1,
        )
    # Row 2: extraction toggle plus the action button.
    with gr.Row():
        extract_toggle = gr.Checkbox(
            value=True,  # readable-text extraction on by default
            label="Extract only the main readable text (recommended)",
        )
        fetch_btn = gr.Button("Fetch", variant="primary")
    # Output as plain text so it’s easy to copy or pipe into other tools
    output = gr.Textbox(
        label="Output",
        lines=20,
        interactive=False,  # display-only; users copy, not edit
        placeholder="Fetched content will appear here…",
    )
    # Wire the button to our function: (url, toggle) -> output text.
    fetch_btn.click(fn=fetch_page, inputs=[url_input, extract_toggle], outputs=output)
# Run as normal, keeping MCP server enabled
# Script entry point: start the Gradio server with its MCP server
# feature enabled (per the original comment, MCP stays on).
if __name__ == "__main__":
    demo.launch(mcp_server=True)