# File: main/app.py
# Purpose: One Space that offers three tools in one MCP server:
# 1) Fetch - extract relevant page content (title, metadata, clean text, hyperlinks)
# 2) Websearch - structured DuckDuckGo results (JSON via LangChain wrapper)
# 3) DDG (Unstructured) - compact plain-text DuckDuckGo results for low token usage
#
# Notes:
# - Launched with mcp_server=True so all functions are exposed as MCP tools.
# - UI uses TabbedInterface: each tool has its own tab.
# - Inline comments describe each section in plain language.
from __future__ import annotations
import re # (layman) used to tidy up whitespace
from typing import List, Dict, Literal, Tuple
import gradio as gr # (layman) the UI framework
import requests # (layman) to download web pages
from bs4 import BeautifulSoup # (layman) for parsing HTML
from readability import Document # (layman) to isolate main readable content
from urllib.parse import urljoin, urldefrag, urlparse # (layman) to fix/clean URLs
# Structured DDG search (LangChain wrapper)
from langchain_community.tools import DuckDuckGoSearchResults
# Unstructured DDG search (lightweight direct client)
from duckduckgo_search import DDGS
# ==============================
# Fetch: HTTP + extraction utils
# ==============================
def _http_get(url: str) -> requests.Response:
"""
(layman) Download the page politely with a short timeout and realistic headers.
"""
headers = {
"User-Agent": "Mozilla/5.0 (compatible; WebMCP/1.0; +https://example.com)",
"Accept-Language": "en-US,en;q=0.9",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
return requests.get(url, headers=headers, timeout=15)
def _normalize_whitespace(text: str) -> str:
"""
(layman) Squeeze extra spaces and blank lines to keep things compact.
"""
text = re.sub(r"[ \t\u00A0]+", " ", text)
text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())
return text.strip()
def _truncate(text: str, max_chars: int) -> Tuple[str, bool]:
"""
(layman) Cut text if it gets too long; return the text and whether we trimmed.
"""
if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
return text, False
return text[:max_chars].rstrip() + " …", True
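# (illustrative) How the two helpers above behave, for a couple of made-up inputs:
#   _normalize_whitespace("a  b\n\n\n\nc") -> "a b\n\nc"   (runs of spaces squeezed, blank lines capped at one)
#   _truncate("hello world", 5)            -> ("hello …", True)
#   _truncate("hello", 0)                  -> ("hello", False)   # a cap of 0 (or less) disables trimming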
def _domain_of(url: str) -> str:
"""
(layman) Show a friendly site name like "example.com".
"""
try:
return urlparse(url).netloc or ""
except Exception:
return ""
def _meta(soup: BeautifulSoup, name: str) -> str | None:
"""(layman) Read the content of a <meta name="..."> tag, if present."""
tag = soup.find("meta", attrs={"name": name})
return tag.get("content") if tag and tag.has_attr("content") else None
def _og(soup: BeautifulSoup, prop: str) -> str | None:
"""(layman) Read the content of an Open Graph <meta property="..."> tag, if present."""
tag = soup.find("meta", attrs={"property": prop})
return tag.get("content") if tag and tag.has_attr("content") else None
def _extract_metadata(soup: BeautifulSoup, final_url: str) -> Dict[str, str]:
"""
(layman) Pull the useful bits: title, description, site name, canonical URL, language, etc.
"""
meta: Dict[str, str] = {}
# Title preference: <title> > og:title > twitter:title
title_candidates = [
(soup.title.string if soup.title and soup.title.string else None),
_og(soup, "og:title"),
_meta(soup, "twitter:title"),
]
meta["title"] = next((t.strip() for t in title_candidates if t and t.strip()), "")
# Description preference: description > og:description > twitter:description
desc_candidates = [
_meta(soup, "description"),
_og(soup, "og:description"),
_meta(soup, "twitter:description"),
]
meta["description"] = next((d.strip() for d in desc_candidates if d and d.strip()), "")
# Canonical link (helps dedupe)
link_canonical = soup.find("link", rel=lambda v: v and "canonical" in v)
meta["canonical"] = (link_canonical.get("href") or "").strip() if link_canonical else ""
# Site name + language info if present
meta["site_name"] = (_og(soup, "og:site_name") or "").strip()
html_tag = soup.find("html")
meta["lang"] = (html_tag.get("lang") or "").strip() if html_tag else ""
# Final URL + domain
meta["fetched_url"] = final_url
meta["domain"] = _domain_of(final_url)
return meta
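# (illustrative) Shape of the dict _extract_metadata returns; the values here are hypothetical,
# the keys are exactly the ones set above:
#   {"title": "Example Post", "description": "A short summary.", "canonical": "https://example.com/post",
#    "site_name": "Example", "lang": "en", "fetched_url": "https://example.com/post?x=1", "domain": "example.com"}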
def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
"""
(layman) Use Readability to isolate the main article and turn it into clean text.
Returns (clean_text, soup_of_readable_html).
"""
# Simplified article HTML from Readability
doc = Document(html)
readable_html = doc.summary(html_partial=True)
# Parse simplified HTML
s = BeautifulSoup(readable_html, "lxml")
# Remove noisy tags
for sel in ["script", "style", "noscript", "iframe", "svg"]:
for tag in s.select(sel):
tag.decompose()
# Keep paragraphs, list items, and subheadings for structure without bloat
text_parts: List[str] = []
for p in s.find_all(["p", "li", "h2", "h3", "h4", "blockquote"]):
chunk = p.get_text(" ", strip=True)
if chunk:
text_parts.append(chunk)
clean_text = _normalize_whitespace("\n\n".join(text_parts))
return clean_text, s
def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> List[Tuple[str, str]]:
"""
(layman) Collect clean, unique, absolute links from the readable section only.
A max_links of 0 (or less) means collect none.
"""
if max_links <= 0:
return []
seen = set()
links: List[Tuple[str, str]] = []
for a in readable_soup.find_all("a", href=True):
href = a.get("href").strip()
# Skip junk links we can't use
if not href or href.startswith("#") or href.startswith("mailto:") or href.startswith("javascript:"):
continue
# Resolve relative URLs, strip fragments (#…)
absolute = urljoin(base_url, href)
absolute, _ = urldefrag(absolute)
if absolute in seen:
continue
seen.add(absolute)
text = a.get_text(" ", strip=True)
if len(text) > 120:
text = text[:117] + "…"
links.append((text or absolute, absolute))
if len(links) >= max_links:
break
return links
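# (illustrative) Link cleanup in _extract_links, assuming base_url="https://example.com/a/":
#   href="../b#section" -> "https://example.com/b" (relative path resolved, fragment dropped)
#   href="#top", href="mailto:someone@example.com", href="javascript:void(0)" -> skipped entirely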
def _format_markdown(
meta: Dict[str, str],
body: str,
body_truncated: bool,
links: List[Tuple[str, str]],
include_text: bool,
include_metadata: bool,
include_links: bool,
verbosity: str,
) -> str:
"""
(layman) Assemble a compact Markdown summary with optional sections.
"""
lines: List[str] = []
# Title header
title = meta.get("title") or meta.get("domain") or "Untitled"
lines.append(f"# {title}")
# Metadata section (only show what exists)
if include_metadata:
md: List[str] = []
if meta.get("description"):
md.append(f"- **Description:** {meta['description']}")
if meta.get("site_name"):
md.append(f"- **Site:** {meta['site_name']}")
if meta.get("canonical"):
md.append(f"- **Canonical:** {meta['canonical']}")
if meta.get("lang"):
md.append(f"- **Language:** {meta['lang']}")
if meta.get("fetched_url"):
md.append(f"- **Fetched From:** {meta['fetched_url']}")
if md:
lines.append("## Metadata")
lines.extend(md)
# Body text
if include_text and body:
if verbosity == "Brief":
brief, was_more = _truncate(body, 800)
lines.append("## Text")
lines.append(brief)
if was_more or body_truncated:
lines.append("\n> (Trimmed for brevity)")
else:
lines.append("## Text")
lines.append(body)
if body_truncated:
lines.append("\n> (Trimmed for brevity)")
# Links section
if include_links and links:
lines.append(f"## Links ({len(links)})")
for text, url in links:
lines.append(f"- [{text}]({url})")
return "\n\n".join(lines).strip()
def extract_relevant( # <-- MCP tool #1
url: str,
verbosity: str = "Standard",
include_metadata: bool = True,
include_text: bool = True,
include_links: bool = True,
max_chars: int = 3000,
max_links: int = 20,
) -> str:
"""
(layman) Given a URL, return a tight Markdown summary: title, key metadata, readable text, and links.
"""
if not url or not url.strip():
return "Please enter a valid URL."
try:
resp = _http_get(url)
resp.raise_for_status()
except requests.exceptions.RequestException as e:
return f"An error occurred: {e}"
final_url = str(resp.url)
ctype = resp.headers.get("Content-Type", "")
if "html" not in ctype.lower():
return f"Unsupported content type for extraction: {ctype or 'unknown'}"
# Decode to text; fall back to the detected encoding when the header doesn't declare a charset
# (requests defaults to ISO-8859-1 for text/* responses, which garbles UTF-8 pages)
if "charset" not in ctype.lower():
resp.encoding = resp.apparent_encoding
html = resp.text
# Full-page soup for metadata
full_soup = BeautifulSoup(html, "lxml")
meta = _extract_metadata(full_soup, final_url)
# Readable content
body_text, readable_soup = _extract_main_text(html)
if not body_text:
# Fallback to "whole-page text" if Readability found nothing
fallback_text = full_soup.get_text(" ", strip=True)
body_text = _normalize_whitespace(fallback_text)
# Verbosity presets (we keep the smaller of preset vs. user cap)
preset_caps = {"Brief": 1200, "Standard": 3000, "Full": 999_999}
target_cap = preset_caps.get(verbosity, 3000)
cap = min(max_chars if max_chars > 0 else target_cap, target_cap)
body_text, truncated = _truncate(body_text, cap) if include_text else ("", False)
# Extract links from the simplified content only
links = _extract_links(readable_soup, final_url, max_links=max_links if include_links else 0)
# Final compact Markdown
md = _format_markdown(
meta=meta,
body=body_text,
body_truncated=truncated,
links=links,
include_text=include_text,
include_metadata=include_metadata,
include_links=include_links,
verbosity=verbosity,
)
return md or "No content could be extracted."
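# (illustrative) A quick manual check, assuming the placeholder URL serves an HTML page:
#   md = extract_relevant("https://example.com/article", verbosity="Brief", max_links=5)
#   print(md)  # Markdown starting with "# <page title>", or a plain error/notice string on failure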
# ========================================
# Websearch (Structured): DuckDuckGo (JSON)
# ========================================
def web_search( # <-- MCP tool #2
input_query: str,
max_results: int = 5,
) -> List[Dict[Literal["snippet", "title", "link"], str]]:
"""
(layman) Run a DuckDuckGo search and return a list of {snippet, title, link}.
"""
if not input_query or not input_query.strip():
return []
# Create the search tool (LangChain community wrapper)
search = DuckDuckGoSearchResults(output_format="list", num_results=max_results)
# Run the search and return results as a list of dicts
results = search.invoke(input_query)
return results
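# (illustrative) Example call and result shape; field values are hypothetical, but the keys
# match the wrapper's list output and the return annotation above:
#   web_search("open source web scraping", max_results=2)
#   -> [{"snippet": "...", "title": "Example Result", "link": "https://example.com/1"},
#       {"snippet": "...", "title": "Another Result", "link": "https://example.com/2"}]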
# ===================================================
# DDG (Unstructured): compact plain-text, low tokens
# ===================================================
def web_search_unstructured( # <-- MCP tool #3
input_query: str,
max_results: int = 5,
style: Literal["urls", "titles+urls", "titles+urls+snippets"] = "titles+urls",
snippet_max_chars: int = 160,
) -> str:
"""
(layman) A lightweight DDG search that returns a plain-text list.
- Fewer tokens than JSON; great for quick scanning or piping into LLM prompts.
- 'style' controls how much text we include per line.
"""
if not input_query or not input_query.strip():
return ""
# (layman) Run the search using the lightweight DDG client
with DDGS() as ddgs:
results = list(ddgs.text(input_query, max_results=max_results))
# (layman) Normalize fields because DDG library keys can vary by version
lines: List[str] = []
for r in results:
title = (r.get("title") or "").strip()
url = (r.get("href") or r.get("link") or r.get("url") or "").strip()
snippet = (r.get("body") or r.get("snippet") or "").strip()
# (layman) Truncate snippet to keep output tight
if snippet_max_chars and len(snippet) > snippet_max_chars:
snippet = snippet[:snippet_max_chars - 1].rstrip() + "…"
# (layman) Build each line according to the chosen style
if style == "urls":
if url:
lines.append(url)
elif style == "titles+urls":
if title and url:
lines.append(f"{title} β€” {url}")
elif url:
lines.append(url)
elif title:
lines.append(title)
else: # titles+urls+snippets
if title and url and snippet:
lines.append(f"{title} β€” {url}\n {snippet}")
elif title and url:
lines.append(f"{title} β€” {url}")
elif url:
# (layman) If only URL is available, still show it
if snippet:
lines.append(f"{url}\n {snippet}")
else:
lines.append(url)
elif title:
if snippet:
lines.append(f"{title}\n {snippet}")
else:
lines.append(title)
# (layman) Join lines with newlines to form a compact text block
return "\n".join(lines).strip()
# =====================
# UI: three-tab interface
# =====================
# --- Fetch tab (compact controllable extraction) ---
fetch_interface = gr.Interface(
fn=extract_relevant, # (layman) connect the function to the UI
inputs=[
gr.Textbox(label="URL", placeholder="https://example.com/article"),
gr.Dropdown(label="Verbosity", choices=["Brief", "Standard", "Full"], value="Standard"),
gr.Checkbox(value=True, label="Include Metadata"),
gr.Checkbox(value=True, label="Include Main Text"),
gr.Checkbox(value=True, label="Include Links"),
gr.Slider(400, 12000, value=3000, step=100, label="Max Characters (body text)"),
gr.Slider(0, 100, value=20, step=1, label="Max Links"),
],
outputs=gr.Markdown(label="Extracted Summary"),
title="Fetch β€” Clean Extract",
description="Extract title, key metadata, readable text, and links. No noisy HTML.",
allow_flagging="never",
theme="Nymbo/Nymbo_Theme",
)
# --- Websearch tab (structured JSON) ---
websearch_interface = gr.Interface(
fn=web_search, # (layman) connect the function to the UI
inputs=[
gr.Textbox(value="", label="Search query", placeholder="site:example.com interesting topic"),
gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results"),
],
outputs=gr.JSON(label="Search results"),
title="Websearch β€” DuckDuckGo (JSON)",
description="Search the web using DuckDuckGo; returns snippet, title, and link as JSON.",
allow_flagging="never",
theme="Nymbo/Nymbo_Theme",
)
# --- DDG (Unstructured) tab (plain text, low tokens) ---
unstructured_interface = gr.Interface(
fn=web_search_unstructured, # (layman) connect the function to the UI
inputs=[
gr.Textbox(value="", label="Search query", placeholder="concise keywords"),
gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results"),
gr.Dropdown(
label="Output style",
choices=["urls", "titles+urls", "titles+urls+snippets"],
value="titles+urls",
info="Plain text list; choose how much detail to include."
),
gr.Slider(
minimum=40, maximum=400, value=160, step=10,
label="Snippet max chars",
info="Truncate snippet length to keep token usage low."
),
],
outputs=gr.Textbox(label="Results (plain text)", interactive=False),
title="DDG β€” Unstructured (Compact)",
description="Outputs a plain-text list (great for low-token prompts).",
allow_flagging="never",
theme="Nymbo/Nymbo_Theme",
)
# --- Combine all three into a single app with tabs ---
demo = gr.TabbedInterface(
interface_list=[fetch_interface, websearch_interface, unstructured_interface],
tab_names=["Fetch", "Websearch", "DDG (Unstructured)"],
)
# Launch the UI and expose all functions as MCP tools in one server
if __name__ == "__main__":
demo.launch(mcp_server=True)