from __future__ import annotations

import re
import json
from typing import List, Dict, Literal, Tuple

import gradio as gr
import requests
from bs4 import BeautifulSoup
from readability import Document
from urllib.parse import urljoin, urldefrag, urlparse
from langchain_community.tools import DuckDuckGoSearchResults
from duckduckgo_search import DDGS


def _http_get(url: str) -> requests.Response:
    """
    Download the page politely with a short timeout and realistic headers.
    (Layman's terms: grab the web page like a normal browser would, but quickly.)
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; WebMCP/1.0; +https://example.com)",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    return requests.get(url, headers=headers, timeout=15)


def _normalize_whitespace(text: str) -> str:
    """
    Squeeze extra spaces and blank lines to keep things compact.
    (Layman's terms: tidy up the text so it’s not full of weird spacing.)
    """
    text = re.sub(r"[ \t\u00A0]+", " ", text)
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())
    return text.strip()
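
# Quick illustration of _normalize_whitespace (shown as a comment so nothing runs
# at import time; the input string is hypothetical):
#
#     _normalize_whitespace("Hello   world\n\n\n\nBye")
#     # -> "Hello world\n\nBye"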


def _truncate(text: str, max_chars: int) -> Tuple[str, bool]:
    """
    Cut text if it gets too long; return the text and whether we trimmed.
    (Layman's terms: shorten long text and tell us if we had to cut it.)
    """
    if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
        return text, False
    return text[:max_chars].rstrip() + " …", True
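
# Minimal sketch of _truncate (illustrative values):
#
#     _truncate("abcdefghij", 5)   # -> ("abcde …", True)
#     _truncate("short", 100)      # -> ("short", False)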


def _shorten(text: str, limit: int) -> str:
    """
    Hard cap a string with an ellipsis to keep tokens small.
    (Layman's terms: force a string to a max length with an ellipsis.)
    """
    if limit <= 0 or len(text) <= limit:
        return text
    return text[: max(0, limit - 1)].rstrip() + "…"
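
# Minimal sketch of _shorten (illustrative values):
#
#     _shorten("abcdefghij", 5)    # -> "abcd…"
#     _shorten("hi", 5)            # -> "hi"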


def _domain_of(url: str) -> str:
    """
    Show a friendly site name like "example.com".
    (Layman's terms: pull the website's domain.)
    """
    try:
        return urlparse(url).netloc or ""
    except Exception:
        return ""


def _meta(soup: BeautifulSoup, name: str) -> str | None:
    tag = soup.find("meta", attrs={"name": name})
    return tag.get("content") if tag and tag.has_attr("content") else None


def _og(soup: BeautifulSoup, prop: str) -> str | None:
    tag = soup.find("meta", attrs={"property": prop})
    return tag.get("content") if tag and tag.has_attr("content") else None


def _extract_metadata(soup: BeautifulSoup, final_url: str) -> Dict[str, str]:
    """
    Pull the useful bits: title, description, site name, canonical URL, language, etc.
    (Layman's terms: gather page basics like title/description/address.)
    """
    meta: Dict[str, str] = {}

    title_candidates = [
        (soup.title.string if soup.title and soup.title.string else None),
        _og(soup, "og:title"),
        _meta(soup, "twitter:title"),
    ]
    meta["title"] = next((t.strip() for t in title_candidates if t and t.strip()), "")

    desc_candidates = [
        _meta(soup, "description"),
        _og(soup, "og:description"),
        _meta(soup, "twitter:description"),
    ]
    meta["description"] = next((d.strip() for d in desc_candidates if d and d.strip()), "")

    link_canonical = soup.find("link", rel=lambda v: v and "canonical" in v)
    meta["canonical"] = (link_canonical.get("href") or "").strip() if link_canonical else ""

    meta["site_name"] = (_og(soup, "og:site_name") or "").strip()
    html_tag = soup.find("html")
    meta["lang"] = (html_tag.get("lang") or "").strip() if html_tag else ""

    meta["fetched_url"] = final_url
    meta["domain"] = _domain_of(final_url)

    return meta
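
# Sketch of the dict _extract_metadata returns (keys are fixed; the HTML and URL
# below are hypothetical stand-ins):
#
#     soup = BeautifulSoup("<html lang='en'><head><title>Hi</title></head></html>", "lxml")
#     _extract_metadata(soup, "https://example.com/post")
#     # -> {"title": "Hi", "description": "", "canonical": "", "site_name": "",
#     #     "lang": "en", "fetched_url": "https://example.com/post", "domain": "example.com"}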


def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
    """
    Use Readability to isolate the main article and turn it into clean text.
    Returns (clean_text, soup_of_readable_html).
    (Layman's terms: find the real article text and clean it.)
    """
    doc = Document(html)
    readable_html = doc.summary(html_partial=True)

    s = BeautifulSoup(readable_html, "lxml")

    for sel in ["script", "style", "noscript", "iframe", "svg"]:
        for tag in s.select(sel):
            tag.decompose()

    text_parts: List[str] = []
    for p in s.find_all(["p", "li", "h2", "h3", "h4", "blockquote"]):
        chunk = p.get_text(" ", strip=True)
        if chunk:
            text_parts.append(chunk)

    clean_text = _normalize_whitespace("\n\n".join(text_parts))
    return clean_text, s
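
# Usage sketch (html_string stands in for a fetched page; real pages give much
# richer output):
#
#     text, readable = _extract_main_text(html_string)
#     # text     -> cleaned article text, paragraphs separated by blank lines
#     # readable -> BeautifulSoup of the Readability-extracted HTML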


def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> List[Tuple[str, str]]:
    """
    Collect clean, unique, absolute links from the readable section only.
    (Layman's terms: pull a tidy list of links from the article body.)
    """
    seen = set()
    links: List[Tuple[str, str]] = []

    for a in readable_soup.find_all("a", href=True):
        href = a.get("href").strip()

        if not href or href.startswith("#") or href.startswith("mailto:") or href.startswith("javascript:"):
            continue

        absolute = urljoin(base_url, href)
        absolute, _ = urldefrag(absolute)

        if absolute in seen:
            continue
        seen.add(absolute)

        text = a.get_text(" ", strip=True)
        if len(text) > 120:
            text = text[:117] + "…"

        links.append((text or absolute, absolute))

        if len(links) >= max_links > 0:
            break

    return links
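
# Minimal sketch of _extract_links on a tiny fragment (URLs are placeholders):
#
#     frag = BeautifulSoup('<p><a href="/about">About</a></p>', "lxml")
#     _extract_links(frag, "https://example.com/post", max_links=10)
#     # -> [("About", "https://example.com/about")]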


def _format_markdown(
    meta: Dict[str, str],
    body: str,
    body_truncated: bool,
    links: List[Tuple[str, str]],
    include_text: bool,
    include_metadata: bool,
    include_links: bool,
    verbosity: str,
) -> str:
    """
    Assemble a compact Markdown summary with optional sections.
    (Layman's terms: build the final markdown output with options.)
    """
    lines: List[str] = []

    title = meta.get("title") or meta.get("domain") or "Untitled"
    lines.append(f"# {title}")

    if include_metadata:
        md: List[str] = []
        if meta.get("description"):
            md.append(f"- **Description:** {meta['description']}")
        if meta.get("site_name"):
            md.append(f"- **Site:** {meta['site_name']}")
        if meta.get("canonical"):
            md.append(f"- **Canonical:** {meta['canonical']}")
        if meta.get("lang"):
            md.append(f"- **Language:** {meta['lang']}")
        if meta.get("fetched_url"):
            md.append(f"- **Fetched From:** {meta['fetched_url']}")
        if md:
            lines.append("## Metadata")
            lines.extend(md)

    if include_text and body:
        if verbosity == "Brief":
            brief, was_more = _truncate(body, 800)
            lines.append("## Text")
            lines.append(brief)
            if was_more or body_truncated:
                lines.append("\n> (Trimmed for brevity)")
        else:
            lines.append("## Text")
            lines.append(body)
            if body_truncated:
                lines.append("\n> (Trimmed for brevity)")

    if include_links and links:
        lines.append(f"## Links ({len(links)})")
        for text, url in links:
            lines.append(f"- [{text}]({url})")

    return "\n\n".join(lines).strip()


def Fetch_Webpage(
    url: str,
    verbosity: str = "Standard",
    include_metadata: bool = True,
    include_text: bool = True,
    include_links: bool = True,
    max_chars: int = 3000,
    max_links: int = 20,
) -> str:
    """
    Fetch a web page and return a compact Markdown summary that includes title, key
    metadata, readable main text, and outbound links.

    (Layman's terms: summarize a page with clean text + useful details.)
    """
    if not url or not url.strip():
        return "Please enter a valid URL."

    try:
        resp = _http_get(url)
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"

    final_url = str(resp.url)
    ctype = resp.headers.get("Content-Type", "")
    if "html" not in ctype.lower():
        return f"Unsupported content type for extraction: {ctype or 'unknown'}"

    resp.encoding = resp.encoding or resp.apparent_encoding
    html = resp.text

    full_soup = BeautifulSoup(html, "lxml")
    meta = _extract_metadata(full_soup, final_url)

    body_text, readable_soup = _extract_main_text(html)
    if not body_text:
        fallback_text = full_soup.get_text(" ", strip=True)
        body_text = _normalize_whitespace(fallback_text)

    preset_caps = {"Brief": 1200, "Standard": 3000, "Full": 999_999}
    target_cap = preset_caps.get(verbosity, 3000)
    cap = min(max_chars if max_chars > 0 else target_cap, target_cap)
    body_text, truncated = _truncate(body_text, cap) if include_text else ("", False)

    links = _extract_links(readable_soup, final_url, max_links=max_links if include_links else 0)

    md = _format_markdown(
        meta=meta,
        body=body_text,
        body_truncated=truncated,
        links=links,
        include_text=include_text,
        include_metadata=include_metadata,
        include_links=include_links,
        verbosity=verbosity,
    )
    return md or "No content could be extracted."
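
# Typical call (the URL is a placeholder; the result is a Markdown string):
#
#     md = Fetch_Webpage("https://example.com/article", verbosity="Brief", max_links=5)
#     print(md)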


def Search_Structured(
    input_query: str,
    max_results: int = 5,
) -> List[Dict[Literal["snippet", "title", "link"], str]]:
    """
    Run a DuckDuckGo search and return structured results as a list of dictionaries.
    (Layman's terms: search DDG and get clean JSON objects.)
    """
    if not input_query or not input_query.strip():
        return []

    search = DuckDuckGoSearchResults(output_format="list", num_results=max_results)

    results = search.invoke(input_query)
    return results
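
# Example result shape (values are illustrative, not real search output):
#
#     Search_Structured("gradio mcp server", max_results=2)
#     # -> [{"snippet": "...", "title": "...", "link": "https://..."}, ...]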


def Search_Raw(
    query: str,
) -> list[dict]:
    """
    Run a DuckDuckGo search using the native `duckduckgo_search` client and return the
    raw Python list of dictionaries from the library.
    (Layman's terms: search DDG and show exactly what the library returns.)
    """
    if not query or not query.strip():
        return []
    with DDGS() as ddgs:
        results = ddgs.text(query, max_results=5)
    return results
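
# The library's raw records typically carry "title", "href", and "body" keys
# (values below are illustrative only):
#
#     Search_Raw("python readability")
#     # -> [{"title": "...", "href": "https://...", "body": "..."}, ...]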


def Search_Concise(
    query: str,
    max_results: int = 5,
    include_snippets: bool = False,
    max_snippet_chars: int = 80,
    dedupe_domains: bool = True,
    title_chars: int = 80,
) -> str:
    """
    Run a DuckDuckGo search and return ultra-compact JSONL lines with short keys to
    minimize tokens.

    (Layman's terms: the tiniest useful search output possible.)
    """
    if not query or not query.strip():
        return ""

    try:
        with DDGS() as ddgs:
            raw = ddgs.text(query, max_results=max_results)
    except Exception as e:
        return json.dumps({"error": str(e)[:120]}, ensure_ascii=False, separators=(",", ":"))

    seen_domains = set()
    lines: List[str] = []

    for r in raw or []:
        title = _shorten((r.get("title") or "").strip(), title_chars)
        url = (r.get("href") or r.get("link") or "").strip()
        body = (r.get("body") or r.get("snippet") or "").strip()

        if not url:
            continue

        if dedupe_domains:
            dom = _domain_of(url)
            if dom in seen_domains:
                continue
            seen_domains.add(dom)

        obj = {"t": title or _domain_of(url), "u": url}

        if include_snippets and body:
            obj["s"] = _shorten(body, max_snippet_chars)

        lines.append(json.dumps(obj, ensure_ascii=False, separators=(",", ":")))

    return "\n".join(lines)


def Generate_Sitemap(
    url: str,
    max_links_per_domain: int = 0,
) -> str:
    """
    Generate a grouped sitemap (Markdown) of anchor links on a page, with an optional
    per-domain cap.

    Args:
        url (str): The starting page URL (http/https). If the scheme is omitted,
            https is assumed.
        max_links_per_domain (int): Limit the number of links shown per domain.
            Use 0 to show all links.

    Returns:
        str: Markdown text containing grouped links under "Internal Links" and
            per-domain "External Links (domain)" sections. If an error occurs or no
            links are found, a short message is returned.
    """
    if not url or not url.strip():
        return "Please enter a valid URL."

    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url.strip()

    try:
        resp = _http_get(url)
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"Error fetching URL: {str(e)}"

    base_url = str(resp.url)
    content_type = resp.headers.get("Content-Type", "")
    if "html" not in content_type.lower():
        return "The provided URL does not appear to be an HTML page."

    soup = BeautifulSoup(resp.content, "lxml")
    anchors = soup.find_all("a", href=True)

    seen_urls: set[str] = set()
    items: List[Dict[str, str]] = []

    for a in anchors:
        href = (a.get("href") or "").strip()
        if not href:
            continue

        if href.startswith(("#", "javascript:", "mailto:", "tel:")):
            continue

        absolute = urljoin(base_url, href)
        absolute, _ = urldefrag(absolute)

        if absolute in seen_urls or absolute == base_url:
            continue
        seen_urls.add(absolute)

        text = (a.get_text(" ", strip=True) or href).strip()
        if len(text) > 100:
            text = text[:100] + "..."

        items.append({"text": text, "url": absolute})

    if not items:
        return "No links found on this page."

    base_netloc = urlparse(base_url).netloc
    domain_groups: Dict[str, List[Dict[str, str]]] = {}

    for it in items:
        netloc = urlparse(it["url"]).netloc
        key = "Internal Links" if netloc == base_netloc else f"External Links ({netloc})"
        domain_groups.setdefault(key, []).append(it)

    total_links = len(items)
    md_lines: List[str] = []
    md_lines.append("# Sitemap")
    md_lines.append(f"Base URL: {base_url}")
    md_lines.append(f"Found {total_links} links:\n")

    keys_sorted = ["Internal Links"] + sorted([k for k in domain_groups if k != "Internal Links"])

    for group_key in keys_sorted:
        if group_key not in domain_groups:
            continue

        group_links = domain_groups[group_key]
        md_lines.append(f"## {group_key}\n")

        if max_links_per_domain and max_links_per_domain > 0:
            links_to_show = group_links[:max_links_per_domain]
            remaining = max(0, len(group_links) - max_links_per_domain)
        else:
            links_to_show = group_links
            remaining = 0

        for link in links_to_show:
            md_lines.append(f"- [{link['text']}]({link['url']})")

        if remaining > 0:
            md_lines.append(f"- ... and {remaining} more links")

        md_lines.append("")

    sitemap_md = "\n".join(md_lines).strip()
    return sitemap_md
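
# Output sketch for a hypothetical page with one internal and one external link:
#
#     # Sitemap
#     Base URL: https://example.com/
#     Found 2 links:
#
#     ## Internal Links
#
#     - [About](https://example.com/about)
#
#     ## External Links (other.example)
#
#     - [Partner](https://other.example/)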


fetch_interface = gr.Interface(
    fn=Fetch_Webpage,
    inputs=[
        gr.Textbox(label="URL", placeholder="https://example.com/article"),
        gr.Dropdown(label="Verbosity", choices=["Brief", "Standard", "Full"], value="Standard"),
        gr.Checkbox(value=True, label="Include Metadata"),
        gr.Checkbox(value=True, label="Include Main Text"),
        gr.Checkbox(value=True, label="Include Links"),
        gr.Slider(400, 12000, value=3000, step=100, label="Max Characters (body text)"),
        gr.Slider(0, 100, value=20, step=1, label="Max Links"),
    ],
    outputs=gr.Markdown(label="Extracted Summary"),
    title="Fetch Webpage",
    description="Extract title, key metadata, readable text, and links. No noisy HTML.",
    api_description=(
        "Fetch a web page and return a compact Markdown summary with title, key "
        "metadata, readable body text, and outbound links. Parameters let you "
        "control verbosity, whether to include metadata/text/links, and limits "
        "for characters and number of links."
    ),
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme",
)


concise_interface = gr.Interface(
    fn=Search_Concise,
    inputs=[
        gr.Textbox(label="Query", placeholder="topic OR site:example.com"),
        gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results"),
        gr.Checkbox(value=False, label="Include snippets (adds tokens)"),
        gr.Slider(minimum=20, maximum=200, value=80, step=5, label="Max snippet chars"),
        gr.Checkbox(value=True, label="Dedupe by domain"),
        gr.Slider(minimum=20, maximum=120, value=80, step=5, label="Max title chars"),
    ],
    outputs=gr.Textbox(label="Results (JSONL)", interactive=False),
    title="DuckDuckGo Search (Concise)",
    description="Emits JSONL with short keys (t,u[,s]). Defaults avoid snippets and duplicate domains.",
    api_description=(
        "Run a DuckDuckGo search and return newline-delimited JSON with short keys: "
        "t=title, u=url, optional s=snippet. Options control result count, "
        "snippet inclusion and length, domain deduping, and title length."
    ),
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme",
    submit_btn="Search",
)


websearch_interface = gr.Interface(
    fn=Search_Structured,
    inputs=[
        gr.Textbox(value="", label="Search query", placeholder="site:example.com interesting topic"),
        gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results"),
    ],
    outputs=gr.JSON(label="Search results"),
    title="DuckDuckGo Search (Structured)",
    description="Search the web using DuckDuckGo; returns snippet, title, and link.",
    api_description=(
        "Run a DuckDuckGo web search and return a list of objects with keys: "
        "snippet, title, and link. Configure the number of results."
    ),
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme",
)


unstructured_interface = gr.Interface(
    fn=Search_Raw,
    inputs=gr.Textbox(label="Enter Search Query"),
    outputs=gr.Textbox(label="Results", interactive=False),
    title="DuckDuckGo Search (Raw)",
    description="Returns the raw list of results (list[dict]) shown as text.",
    api_description=(
        "Run DuckDuckGo via the native client and return the raw list[dict] as "
        "provided by duckduckgo_search (fields like title, href/link, body/snippet)."
    ),
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme",
    submit_btn="Search",
)


sitemap_interface = gr.Interface(
    fn=Generate_Sitemap,
    inputs=[
        gr.Textbox(
            label="Website URL",
            placeholder="https://example.com or example.com"
        ),
        gr.Slider(
            minimum=0,
            maximum=1000,
            value=0,
            step=1,
            label="Max links per domain (0 = show all)"
        ),
    ],
    outputs=gr.Markdown(label="Sitemap (Markdown)"),
    title="Generate Sitemap",
    description="Group links by Internal/External domains; optionally limit links per domain.",
    api_description=(
        "Scan a page and build a grouped sitemap of anchor links. Links are grouped as "
        "Internal or External (per domain). Set a per-domain cap; 0 shows all."
    ),
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme",
    submit_btn="Generate",
)


demo = gr.TabbedInterface(
    interface_list=[fetch_interface, concise_interface, websearch_interface, unstructured_interface, sitemap_interface],
    tab_names=[
        "Fetch Webpage",
        "DuckDuckGo Search (Concise)",
        "DuckDuckGo Search (Structured)",
        "DuckDuckGo Search (Raw)",
        "Generate Sitemap",
    ],
    title="Web MCP — Fetch, Search, and Sitemaps with customizable output modes.",
    theme="Nymbo/Nymbo_Theme",
)
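

# Launching with mcp_server=True also exposes the interfaces above as MCP tools
# (this assumes a Gradio build with MCP support installed, e.g. the gradio[mcp] extra).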
if __name__ == "__main__":
    demo.launch(mcp_server=True)