|
|
|
|
|
|
|
import re |
|
import gradio as gr |
|
import requests |
|
from urllib.parse import urlparse |
|
from bs4 import BeautifulSoup |
|
|
|
|
|
try: |
|
import trafilatura |
|
except Exception: |
|
trafilatura = None |
|
|
|
|
|
|
|
|
|
|
|
|
|
def _normalize_url(url: str) -> str: |
|
""" |
|
Make sure the URL has a scheme; default to https:// if missing. |
|
This avoids 'Invalid URL' errors for inputs like 'example.com'. |
|
""" |
|
url = (url or "").strip() |
|
if not url: |
|
raise ValueError("Please enter a URL.") |
|
parsed = urlparse(url) |
|
if not parsed.scheme: |
|
url = "https://" + url |
|
return url |
|
|
|
|
|
def _fetch(url: str, timeout: int = 15) -> requests.Response:
    """
    Download *url* and return the response object.

    Sends a browser-like User-Agent (some sites reject unidentified
    clients), follows redirects, enforces a timeout, and raises
    requests.HTTPError on 4xx/5xx so callers get clear feedback.
    """
    user_agent = "Mozilla/5.0 (compatible; SmartTextFetcher/1.0; +https://huggingface.co/spaces)"
    accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    response = requests.get(
        url,
        headers={"User-Agent": user_agent, "Accept": accept},
        timeout=timeout,
        allow_redirects=True,
    )
    response.raise_for_status()
    return response
|
|
|
|
|
def _extract_title_from_html(html: str) -> str | None:
    """
    Pull the <title> tag text, if present, for a nicer header.

    Returns None when there is no <title> or it is empty.
    """
    # Prefer the faster lxml parser; fall back to the stdlib parser
    # when lxml isn't installed.
    try:
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        soup = BeautifulSoup(html, "html.parser")
    title_tag = soup.find("title")
    if title_tag is None:
        return None
    # Use get_text() rather than .string: .string is None whenever the
    # tag holds more than one child node (entity/text splits from some
    # parsers), which silently dropped valid titles.
    title = title_tag.get_text(strip=True)
    return title or None
|
|
|
|
|
def _visible_text_from_html(html: str) -> str:
    """
    Fallback extractor: strip scripts/styles/nav/ads and return visible text.

    This is a heuristic but works well when the smart extractor isn't available.
    """
    # Prefer lxml; fall back to the stdlib parser when it's unavailable.
    try:
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        soup = BeautifulSoup(html, "html.parser")

    # Structural elements that never hold article text.
    for tag in soup(["script", "style", "noscript", "svg", "path", "form",
                     "header", "footer", "nav", "aside", "iframe"]):
        tag.decompose()

    # class/id tokens that usually mark boilerplate (navigation, ads, ...).
    kill_words = frozenset((
        "nav", "menu", "footer", "header", "cookie", "banner", "subscribe",
        "newsletter", "sidebar", "social", "share", "comment", "promo",
        "advert", "ad", "ads", "breadcrumbs", "breadcrumb",
    ))
    # Longer kill words also match as prefixes so variants like
    # 'comments', 'headers' or 'advertisement' are still caught.
    prefix_kills = tuple(k for k in kill_words if len(k) >= 4)

    def _is_boilerplate(token: str) -> bool:
        return token in kill_words or token.startswith(prefix_kills)

    for el in soup.find_all(True):
        # An ancestor may already have been decomposed earlier in this
        # loop; touching such an element's attributes raises on modern
        # BeautifulSoup, so skip it.
        if getattr(el, "decomposed", False):
            continue
        meta = " ".join(el.get("class", []) + [el.get("id", "")]).lower()
        # Match whole tokens (split on non-alphanumerics), not raw
        # substrings: 'ad' must not match 'gradient', 'badge', 'upload'.
        tokens = re.split(r"[^a-z0-9]+", meta)
        if any(_is_boilerplate(t) for t in tokens):
            el.decompose()

    # Prefer semantic containers for the main content when present.
    main = soup.find("article") or soup.find("main") or soup.body or soup

    blocks = main.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "blockquote"])
    lines = []
    for b in blocks:
        text = b.get_text(" ", strip=True)
        if len(text) >= 3:  # skip stray glyphs and separators
            lines.append(text)

    # Fall back to the whole container's text when no block elements survived.
    text = "\n\n".join(lines) if lines else main.get_text(" ", strip=True)

    # Normalize whitespace left behind by the removals above.
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = re.sub(r"[ \t]{2,}", " ", text)
    return text.strip()
|
|
|
|
|
def _smart_main_text(html: str, url: str) -> str | None:
    """
    Use Trafilatura to pull the main/article text when available.

    Returns None when the library is missing, extraction fails, or
    extraction yields nothing.
    """
    # The module-level import is optional; bail out when it failed.
    if trafilatura is None:
        return None
    try:
        result = trafilatura.extract(
            html,
            include_comments=False,
            favor_recall=True,
            url=url,
        )
    except Exception:
        return None
    # Normalize an empty result ('' or None) to None for callers.
    return result if result else None
|
|
|
|
|
def _truncate(text: str, max_chars: int) -> str: |
|
""" |
|
Optional safety guard so outputs stay small and responsive. |
|
""" |
|
if max_chars is None or max_chars <= 0: |
|
return text |
|
if len(text) <= max_chars: |
|
return text |
|
return text[:max_chars].rstrip() + "\n\n… [truncated]" |
|
|
|
|
|
|
|
|
|
|
|
|
|
def fetch_relevant_text(
    url: str,
    mode: str = "Main article (smart)",
    max_chars: int = 8000,
    include_title: bool = True
) -> str:
    """
    Main entry point powered by the UI.

    - Validates the URL
    - Fetches the page
    - Extracts relevant text based on the selected mode
    - Optionally prefixes the page <title>

    Args:
        url: Page address; scheme is added by _normalize_url if missing.
        mode: One of the UI radio choices; matched by prefix ("Main
            article", "Visible text", anything else falls back to raw HTML).
        max_chars: Truncation limit passed to _truncate (<= 0 disables it).
        include_title: Prefix the page <title> unless in raw-HTML mode.

    Returns:
        The extracted (possibly truncated) text, or a human-readable
        error message — this function never raises, so the UI always
        gets a string to display.
    """
    try:
        url = _normalize_url(url)
        resp = _fetch(url)
        content_type = (resp.headers.get("Content-Type") or "").lower()

        # Plain text responses need no HTML processing at all.
        if "text/plain" in content_type and resp.text:
            text = resp.text.strip()

        # HTML by declared type, or by sniffing '<html' when the server
        # sent a missing/odd Content-Type header.
        elif "text/html" in content_type or "application/xhtml+xml" in content_type or "<html" in resp.text.lower():
            html = resp.text

            # Mode is matched by prefix so the UI labels can carry
            # suffixes like "(smart)" / "(fallback)" / "(debug)".
            if mode.startswith("Main article"):
                # Smart extractor first; heuristic fallback when it
                # is unavailable or returns nothing.
                text = _smart_main_text(html, url) or _visible_text_from_html(html)
            elif mode.startswith("Visible text"):
                text = _visible_text_from_html(html)
            else:
                # Raw HTML (debug) mode: return the markup untouched.
                text = html

            # Title prefix makes no sense for raw HTML output.
            if include_title and not mode.startswith("Raw HTML"):
                title = _extract_title_from_html(html)
                if title:
                    text = f"{title}\n\n{text}".strip()

        else:
            # Binary or other unsupported payloads: explain instead of
            # dumping undecodable content. Skips truncation deliberately.
            return f"Unsupported content type: {content_type or 'unknown'}. This tool extracts text from HTML pages."

        return _truncate(text, max_chars)

    # Map failures to readable strings so Gradio shows a message
    # rather than a stack trace.
    except requests.exceptions.RequestException as e:
        return f"Network error while fetching the URL: {e}"
    except ValueError as ve:
        # Raised by _normalize_url for empty input; message is user-facing.
        return f"{ve}"
    except Exception as e:
        return f"Unexpected error: {e}"
|
|
|
|
|
|
|
|
|
|
|
|
|
# UI definition: a single-page Gradio app wired to fetch_relevant_text.
with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP — Smart Text") as demo:

    # Intro / usage notes shown at the top of the app.
    gr.Markdown(
        """
        # Fetch MCP — Smart Text
        Enter a URL and get the **relevant text** back (not the raw HTML).
        Use “Main article (smart)” for best results; switch to “Visible text” if needed.
        """
    )

    with gr.Row():
        url_in = gr.Textbox(label="URL", placeholder="https://example.com/some-article", scale=4)
    with gr.Row():
        # Choice labels are matched by prefix in fetch_relevant_text
        # ("Main article", "Visible text", "Raw HTML").
        mode_in = gr.Radio(
            label="Extraction mode",
            choices=[
                "Main article (smart)",
                "Visible text (fallback)",
                "Raw HTML (debug)"
            ],
            value="Main article (smart)",
            scale=3
        )
        include_title_in = gr.Checkbox(label="Include page title", value=True, scale=1)
        # Feeds the max_chars truncation guard in fetch_relevant_text.
        max_chars_in = gr.Slider(
            label="Max characters (to keep responses fast)",
            minimum=500,
            maximum=40000,
            step=500,
            value=8000,
            scale=3
        )

    out = gr.Textbox(label="Extracted Text", lines=22)

    go = gr.Button("Fetch")
    # Input order must match fetch_relevant_text's parameter order:
    # (url, mode, max_chars, include_title).
    go.click(fetch_relevant_text, inputs=[url_in, mode_in, max_chars_in, include_title_in], outputs=out)
|
|
|
|
|
if __name__ == "__main__":
    # mcp_server=True additionally exposes the app's functions over the
    # Model Context Protocol alongside the regular web UI.
    demo.launch(mcp_server=True)
|
|