# File: main/app.py
# Purpose: One Space that offers three tools in one MCP server:
# 1) Fetch - extract relevant page content (title, metadata, clean text, hyperlinks)
# 2) Websearch - structured DuckDuckGo results (JSON via LangChain wrapper)
# 3) DDG (Unstructured) - compact plain-text DuckDuckGo results for low token usage
#
# Notes:
# - Launched with mcp_server=True so all functions are exposed as MCP tools.
# - UI uses TabbedInterface: each tool has its own tab.
# - Inline comments describe each section in plain language.
from __future__ import annotations
import re # (layman) used to tidy up whitespace
from typing import List, Dict, Literal, Tuple
import gradio as gr # (layman) the UI framework
import requests # (layman) to download web pages
from bs4 import BeautifulSoup # (layman) for parsing HTML
from readability import Document # (layman) to isolate main readable content
from urllib.parse import urljoin, urldefrag, urlparse # (layman) to fix/clean URLs
# Structured DDG search (LangChain wrapper)
from langchain_community.tools import DuckDuckGoSearchResults
# Unstructured DDG search (lightweight direct client)
from duckduckgo_search import DDGS
# ==============================
# Fetch: HTTP + extraction utils
# ==============================
def _http_get(url: str) -> requests.Response:
"""
(layman) Download the page politely with a short timeout and realistic headers.
"""
headers = {
"User-Agent": "Mozilla/5.0 (compatible; WebMCP/1.0; +https://example.com)",
"Accept-Language": "en-US,en;q=0.9",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
return requests.get(url, headers=headers, timeout=15)
def _normalize_whitespace(text: str) -> str:
"""
(layman) Squeeze extra spaces and blank lines to keep things compact.
"""
text = re.sub(r"[ \t\u00A0]+", " ", text)
text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())
return text.strip()
def _truncate(text: str, max_chars: int) -> Tuple[str, bool]:
"""
(layman) Cut text if it gets too long; return the text and whether we trimmed.
"""
if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
return text, False
return text[:max_chars].rstrip() + " …", True
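# (illustrative) How the two helpers above behave, for a couple of made-up inputs:
#   _normalize_whitespace("a  b\n\n\n\nc") -> "a b\n\nc"   (runs of spaces squeezed, blank lines capped at one)
#   _truncate("hello world", 5)            -> ("hello …", True)
#   _truncate("hello", 0)                  -> ("hello", False)   # a cap of 0 (or less) disables trimming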
def _domain_of(url: str) -> str:
"""
(layman) Show a friendly site name like "example.com".
"""
try:
return urlparse(url).netloc or ""
except Exception:
return ""
def _meta(soup: BeautifulSoup, name: str) -> str | None:
"""(layman) Read the content of a <meta name="..."> tag, if present."""
tag = soup.find("meta", attrs={"name": name})
return tag.get("content") if tag and tag.has_attr("content") else None
def _og(soup: BeautifulSoup, prop: str) -> str | None:
"""(layman) Read the content of an Open Graph <meta property="..."> tag, if present."""
tag = soup.find("meta", attrs={"property": prop})
return tag.get("content") if tag and tag.has_attr("content") else None
def _extract_metadata(soup: BeautifulSoup, final_url: str) -> Dict[str, str]:
"""
(layman) Pull the useful bits: title, description, site name, canonical URL, language, etc.
"""
meta: Dict[str, str] = {}
# Title preference: <title> > og:title > twitter:title
title_candidates = [
(soup.title.string if soup.title and soup.title.string else None),
_og(soup, "og:title"),
_meta(soup, "twitter:title"),
]
meta["title"] = next((t.strip() for t in title_candidates if t and t.strip()), "")
# Description preference: description > og:description > twitter:description
desc_candidates = [
_meta(soup, "description"),
_og(soup, "og:description"),
_meta(soup, "twitter:description"),
]
meta["description"] = next((d.strip() for d in desc_candidates if d and d.strip()), "")
# Canonical link (helps dedupe)
link_canonical = soup.find("link", rel=lambda v: v and "canonical" in v)
meta["canonical"] = (link_canonical.get("href") or "").strip() if link_canonical else ""
# Site name + language info if present
meta["site_name"] = (_og(soup, "og:site_name") or "").strip()
html_tag = soup.find("html")
meta["lang"] = (html_tag.get("lang") or "").strip() if html_tag else ""
# Final URL + domain
meta["fetched_url"] = final_url
meta["domain"] = _domain_of(final_url)
return meta
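# (illustrative) Shape of the dict _extract_metadata returns; the values here are hypothetical,
# the keys are exactly the ones set above:
#   {"title": "Example Post", "description": "A short summary.", "canonical": "https://example.com/post",
#    "site_name": "Example", "lang": "en", "fetched_url": "https://example.com/post?x=1", "domain": "example.com"}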
def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
"""
(layman) Use Readability to isolate the main article and turn it into clean text.
Returns (clean_text, soup_of_readable_html).
"""
# Simplified article HTML from Readability
doc = Document(html)
readable_html = doc.summary(html_partial=True)
# Parse simplified HTML
s = BeautifulSoup(readable_html, "lxml")
# Remove noisy tags
for sel in ["script", "style", "noscript", "iframe", "svg"]:
for tag in s.select(sel):
tag.decompose()
# Keep paragraphs, list items, and subheadings for structure without bloat
text_parts: List[str] = []
for p in s.find_all(["p", "li", "h2", "h3", "h4", "blockquote"]):
chunk = p.get_text(" ", strip=True)
if chunk:
text_parts.append(chunk)
clean_text = _normalize_whitespace("\n\n".join(text_parts))
return clean_text, s
def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> List[Tuple[str, str]]:
"""
(layman) Collect clean, unique, absolute links from the readable section only.
A max_links of 0 (or less) means collect none.
"""
if max_links <= 0:
return []
seen = set()
links: List[Tuple[str, str]] = []
for a in readable_soup.find_all("a", href=True):
href = a.get("href").strip()
# Skip junk links we can't use
if not href or href.startswith("#") or href.startswith("mailto:") or href.startswith("javascript:"):
continue
# Resolve relative URLs, strip fragments (#…)
absolute = urljoin(base_url, href)
absolute, _ = urldefrag(absolute)
if absolute in seen:
continue
seen.add(absolute)
text = a.get_text(" ", strip=True)
if len(text) > 120:
text = text[:117] + "…"
links.append((text or absolute, absolute))
if len(links) >= max_links:
break
return links
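# (illustrative) Link cleanup in _extract_links, assuming base_url="https://example.com/a/":
#   href="../b#section" -> "https://example.com/b" (relative path resolved, fragment dropped)
#   href="#top", href="mailto:someone@example.com", href="javascript:void(0)" -> skipped entirely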
def _format_markdown(
meta: Dict[str, str],
body: str,
body_truncated: bool,
links: List[Tuple[str, str]],
include_text: bool,
include_metadata: bool,
include_links: bool,
verbosity: str,
) -> str:
"""
(layman) Assemble a compact Markdown summary with optional sections.
"""
lines: List[str] = []
# Title header
title = meta.get("title") or meta.get("domain") or "Untitled"
lines.append(f"# {title}")
# Metadata section (only show what exists)
if include_metadata:
md: List[str] = []
if meta.get("description"):
md.append(f"- **Description:** {meta['description']}")
if meta.get("site_name"):
md.append(f"- **Site:** {meta['site_name']}")
if meta.get("canonical"):
md.append(f"- **Canonical:** {meta['canonical']}")
if meta.get("lang"):
md.append(f"- **Language:** {meta['lang']}")
if meta.get("fetched_url"):
md.append(f"- **Fetched From:** {meta['fetched_url']}")
if md:
lines.append("## Metadata")
lines.extend(md)
# Body text
if include_text and body:
if verbosity == "Brief":
brief, was_more = _truncate(body, 800)
lines.append("## Text")
lines.append(brief)
if was_more or body_truncated:
lines.append("\n> (Trimmed for brevity)")
else:
lines.append("## Text")
lines.append(body)
if body_truncated:
lines.append("\n> (Trimmed for brevity)")
# Links section
if include_links and links:
lines.append(f"## Links ({len(links)})")
for text, url in links:
lines.append(f"- [{text}]({url})")
return "\n\n".join(lines).strip()
def extract_relevant( # <-- MCP tool #1
url: str,
verbosity: str = "Standard",
include_metadata: bool = True,
include_text: bool = True,
include_links: bool = True,
max_chars: int = 3000,
max_links: int = 20,
) -> str:
"""
(layman) Given a URL, return a tight Markdown summary: title, key metadata, readable text, and links.
"""
if not url or not url.strip():
return "Please enter a valid URL."
try:
resp = _http_get(url)
resp.raise_for_status()
except requests.exceptions.RequestException as e:
return f"An error occurred: {e}"
final_url = str(resp.url)
ctype = resp.headers.get("Content-Type", "")
if "html" not in ctype.lower():
return f"Unsupported content type for extraction: {ctype or 'unknown'}"
# Decode to text; fall back to the detected encoding when the header doesn't declare a charset
# (requests defaults to ISO-8859-1 for text/* responses, which garbles UTF-8 pages)
if "charset" not in ctype.lower():
resp.encoding = resp.apparent_encoding
html = resp.text
# Full-page soup for metadata
full_soup = BeautifulSoup(html, "lxml")
meta = _extract_metadata(full_soup, final_url)
# Readable content
body_text, readable_soup = _extract_main_text(html)
if not body_text:
# Fallback to "whole-page text" if Readability found nothing
fallback_text = full_soup.get_text(" ", strip=True)
body_text = _normalize_whitespace(fallback_text)
# Verbosity presets (we keep the smaller of preset vs. user cap)
preset_caps = {"Brief": 1200, "Standard": 3000, "Full": 999_999}
target_cap = preset_caps.get(verbosity, 3000)
cap = min(max_chars if max_chars > 0 else target_cap, target_cap)
body_text, truncated = _truncate(body_text, cap) if include_text else ("", False)
# Extract links from the simplified content only
links = _extract_links(readable_soup, final_url, max_links=max_links if include_links else 0)
# Final compact Markdown
md = _format_markdown(
meta=meta,
body=body_text,
body_truncated=truncated,
links=links,
include_text=include_text,
include_metadata=include_metadata,
include_links=include_links,
verbosity=verbosity,
)
return md or "No content could be extracted."
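# (illustrative) A quick manual check, assuming the placeholder URL serves an HTML page:
#   md = extract_relevant("https://example.com/article", verbosity="Brief", max_links=5)
#   print(md)  # Markdown starting with "# <page title>", or a plain error/notice string on failure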
# ========================================
# Websearch (Structured): DuckDuckGo (JSON)
# ========================================
def web_search( # <-- MCP tool #2
input_query: str,
max_results: int = 5,
) -> List[Dict[Literal["snippet", "title", "link"], str]]:
"""
(layman) Run a DuckDuckGo search and return a list of {snippet, title, link}.
"""
if not input_query or not input_query.strip():
return []
# Create the search tool (LangChain community wrapper)
search = DuckDuckGoSearchResults(output_format="list", num_results=max_results)
# Run the search and return results as a list of dicts
results = search.invoke(input_query)
return results
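# (illustrative) Example call and result shape; field values are hypothetical, but the keys
# match the wrapper's list output and the return annotation above:
#   web_search("open source web scraping", max_results=2)
#   -> [{"snippet": "...", "title": "Example Result", "link": "https://example.com/1"},
#       {"snippet": "...", "title": "Another Result", "link": "https://example.com/2"}]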
# ===================================================
# DDG (Unstructured): compact plain-text, low tokens
# ===================================================
def web_search_unstructured( # <-- MCP tool #3
input_query: str,
max_results: int = 5,
style: Literal["urls", "titles+urls", "titles+urls+snippets"] = "titles+urls",
snippet_max_chars: int = 160,
) -> str:
"""
(layman) A lightweight DDG search that returns a plain-text list.
- Fewer tokens than JSON; great for quick scanning or piping into LLM prompts.
- 'style' controls how much text we include per line.
"""
if not input_query or not input_query.strip():
return ""
# (layman) Run the search using the lightweight DDG client
with DDGS() as ddgs:
results = list(ddgs.text(input_query, max_results=max_results))
# (layman) Normalize fields because DDG library keys can vary by version
lines: List[str] = []
for r in results:
title = (r.get("title") or "").strip()
url = (r.get("href") or r.get("link") or r.get("url") or "").strip()
snippet = (r.get("body") or r.get("snippet") or "").strip()
# (layman) Truncate snippet to keep output tight
if snippet_max_chars and len(snippet) > snippet_max_chars:
snippet = snippet[:snippet_max_chars - 1].rstrip() + "…"
# (layman) Build each line according to the chosen style
if style == "urls":
if url:
lines.append(url)
elif style == "titles+urls":
if title and url:
lines.append(f"{title} β€” {url}")
elif url:
lines.append(url)
elif title:
lines.append(title)
else: # titles+urls+snippets
if title and url and snippet:
lines.append(f"{title} β€” {url}\n {snippet}")
elif title and url:
lines.append(f"{title} β€” {url}")
elif url:
# (layman) If only URL is available, still show it
if snippet:
lines.append(f"{url}\n {snippet}")
else:
lines.append(url)
elif title:
if snippet:
lines.append(f"{title}\n {snippet}")
else:
lines.append(title)
# (layman) Join lines with newlines to form a compact text block
return "\n".join(lines).strip()
# =====================
# UI: three-tab interface
# =====================
# --- Fetch tab (compact controllable extraction) ---
fetch_interface = gr.Interface(
fn=extract_relevant, # (layman) connect the function to the UI
inputs=[
gr.Textbox(label="URL", placeholder="https://example.com/article"),
gr.Dropdown(label="Verbosity", choices=["Brief", "Standard", "Full"], value="Standard"),
gr.Checkbox(value=True, label="Include Metadata"),
gr.Checkbox(value=True, label="Include Main Text"),
gr.Checkbox(value=True, label="Include Links"),
gr.Slider(400, 12000, value=3000, step=100, label="Max Characters (body text)"),
gr.Slider(0, 100, value=20, step=1, label="Max Links"),
],
outputs=gr.Markdown(label="Extracted Summary"),
title="Fetch β€” Clean Extract",
description="Extract title, key metadata, readable text, and links. No noisy HTML.",
allow_flagging="never",
theme="Nymbo/Nymbo_Theme",
)
# --- Websearch tab (structured JSON) ---
websearch_interface = gr.Interface(
fn=web_search, # (layman) connect the function to the UI
inputs=[
gr.Textbox(value="", label="Search query", placeholder="site:example.com interesting topic"),
gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results"),
],
outputs=gr.JSON(label="Search results"),
title="Websearch β€” DuckDuckGo (JSON)",
description="Search the web using DuckDuckGo; returns snippet, title, and link as JSON.",
allow_flagging="never",
theme="Nymbo/Nymbo_Theme",
)
# --- DDG (Unstructured) tab (plain text, low tokens) ---
unstructured_interface = gr.Interface(
fn=web_search_unstructured, # (layman) connect the function to the UI
inputs=[
gr.Textbox(value="", label="Search query", placeholder="concise keywords"),
gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results"),
gr.Dropdown(
label="Output style",
choices=["urls", "titles+urls", "titles+urls+snippets"],
value="titles+urls",
info="Plain text list; choose how much detail to include."
),
gr.Slider(
minimum=40, maximum=400, value=160, step=10,
label="Snippet max chars",
info="Truncate snippet length to keep token usage low."
),
],
outputs=gr.Textbox(label="Results (plain text)", interactive=False),
title="DDG β€” Unstructured (Compact)",
description="Outputs a plain-text list (great for low-token prompts).",
allow_flagging="never",
theme="Nymbo/Nymbo_Theme",
)
# --- Combine all three into a single app with tabs ---
demo = gr.TabbedInterface(
interface_list=[fetch_interface, websearch_interface, unstructured_interface],
tab_names=["Fetch", "Websearch", "DDG (Unstructured)"],
)
# Launch the UI and expose all functions as MCP tools in one server
if __name__ == "__main__":
demo.launch(mcp_server=True)