Nymbo committed on
Commit a655b89 · verified · 1 Parent(s): 8d4d649

Create app.py

Files changed (1)
  1. app.py +371 -0
app.py ADDED
@@ -0,0 +1,371 @@
# File: main/app.py
# Purpose: One Space that offers two tools:
# 1) Fetch: extract relevant page content (title, metadata, clean text, hyperlinks)
# 2) Websearch: DuckDuckGo web search
#
# Notes:
# - Launched with mcp_server=True so both functions are available as MCP tools.
# - UI uses TabbedInterface so you can use each tool from its own tab.
# - Inline comments explain each section in plain language.

from __future__ import annotations

import re  # (layman) used to tidy up whitespace
from typing import List, Dict, Literal, Tuple

import gradio as gr  # (layman) the UI framework
import requests  # (layman) to download web pages
from bs4 import BeautifulSoup  # (layman) for parsing HTML
from readability import Document  # (layman) to isolate main readable content
from urllib.parse import urljoin, urldefrag, urlparse  # (layman) to fix/clean URLs

# DuckDuckGo via LangChain community tool
from langchain_community.tools import DuckDuckGoSearchResults

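# (layman) Dependencies implied by the imports above (the usual package names
# are listed here as a guide; pin whatever your requirements.txt actually uses):
# gradio (with MCP support), requests, beautifulsoup4, lxml, readability-lxml,
# langchain-community, plus a DuckDuckGo search package such as
# duckduckgo-search that the LangChain wrapper calls into.
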
# ==============================
# Fetch: HTTP + extraction utils
# ==============================

def _http_get(url: str) -> requests.Response:
    """
    (layman) Download the page politely with a short timeout and realistic headers.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; WebMCP/1.0; +https://example.com)",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    return requests.get(url, headers=headers, timeout=15)


def _normalize_whitespace(text: str) -> str:
    """
    (layman) Squeeze extra spaces and blank lines to keep things compact.
    """
    text = re.sub(r"[ \t\u00A0]+", " ", text)
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())
    return text.strip()


def _truncate(text: str, max_chars: int) -> Tuple[str, bool]:
    """
    (layman) Cut text if it gets too long; return the text and whether we trimmed.
    """
    if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
        return text, False
    return text[:max_chars].rstrip() + " …", True

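# (layman) For example, _truncate("abcdef", 4) returns ("abcd …", True), while
# _truncate("abc", 10) returns ("abc", False) because nothing was cut.
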

def _domain_of(url: str) -> str:
    """
    (layman) Show a friendly site name like "example.com".
    """
    try:
        return urlparse(url).netloc or ""
    except Exception:
        return ""


def _meta(soup: BeautifulSoup, name: str) -> str | None:
    tag = soup.find("meta", attrs={"name": name})
    return tag.get("content") if tag and tag.has_attr("content") else None


def _og(soup: BeautifulSoup, prop: str) -> str | None:
    tag = soup.find("meta", attrs={"property": prop})
    return tag.get("content") if tag and tag.has_attr("content") else None


def _extract_metadata(soup: BeautifulSoup, final_url: str) -> Dict[str, str]:
    """
    (layman) Pull the useful bits: title, description, site name, canonical URL, language, etc.
    """
    meta: Dict[str, str] = {}

    # Title preference: <title> > og:title > twitter:title
    title_candidates = [
        (soup.title.string if soup.title and soup.title.string else None),
        _og(soup, "og:title"),
        _meta(soup, "twitter:title"),
    ]
    meta["title"] = next((t.strip() for t in title_candidates if t and t.strip()), "")

    # Description preference: description > og:description > twitter:description
    desc_candidates = [
        _meta(soup, "description"),
        _og(soup, "og:description"),
        _meta(soup, "twitter:description"),
    ]
    meta["description"] = next((d.strip() for d in desc_candidates if d and d.strip()), "")

    # Canonical link (helps dedupe)
    link_canonical = soup.find("link", rel=lambda v: v and "canonical" in v)
    meta["canonical"] = (link_canonical.get("href") or "").strip() if link_canonical else ""

    # Site name + language info if present
    meta["site_name"] = (_og(soup, "og:site_name") or "").strip()
    html_tag = soup.find("html")
    meta["lang"] = (html_tag.get("lang") or "").strip() if html_tag else ""

    # Final URL + domain
    meta["fetched_url"] = final_url
    meta["domain"] = _domain_of(final_url)

    return meta


def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
    """
    (layman) Use Readability to isolate the main article and turn it into clean text.
    Returns (clean_text, soup_of_readable_html).
    """
    # Simplified article HTML from Readability
    doc = Document(html)
    readable_html = doc.summary(html_partial=True)

    # Parse simplified HTML
    s = BeautifulSoup(readable_html, "lxml")

    # Remove noisy tags
    for sel in ["script", "style", "noscript", "iframe", "svg"]:
        for tag in s.select(sel):
            tag.decompose()

    # Keep paragraphs, list items, and subheadings for structure without bloat
    text_parts: List[str] = []
    for p in s.find_all(["p", "li", "h2", "h3", "h4", "blockquote"]):
        chunk = p.get_text(" ", strip=True)
        if chunk:
            text_parts.append(chunk)

    clean_text = _normalize_whitespace("\n\n".join(text_parts))
    return clean_text, s


def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> List[Tuple[str, str]]:
    """
    (layman) Collect clean, unique, absolute links from the readable section only.
    """
    seen = set()
    links: List[Tuple[str, str]] = []

    for a in readable_soup.find_all("a", href=True):
        href = a.get("href").strip()
        # Skip junk links we can't use
        if not href or href.startswith("#") or href.startswith("mailto:") or href.startswith("javascript:"):
            continue

        # Resolve relative URLs, strip fragments (#…)
        absolute = urljoin(base_url, href)
        absolute, _ = urldefrag(absolute)

        if absolute in seen:
            continue
        seen.add(absolute)

        text = a.get_text(" ", strip=True)
        if len(text) > 120:
            text = text[:117] + "…"

        links.append((text or absolute, absolute))

        if len(links) >= max_links > 0:
            break

    return links


def _format_markdown(
    meta: Dict[str, str],
    body: str,
    body_truncated: bool,
    links: List[Tuple[str, str]],
    include_text: bool,
    include_metadata: bool,
    include_links: bool,
    verbosity: str,
) -> str:
    """
    (layman) Assemble a compact Markdown summary with optional sections.
    """
    lines: List[str] = []

    # Title header
    title = meta.get("title") or meta.get("domain") or "Untitled"
    lines.append(f"# {title}")

    # Metadata section (only show what exists)
    if include_metadata:
        md: List[str] = []
        if meta.get("description"):
            md.append(f"- **Description:** {meta['description']}")
        if meta.get("site_name"):
            md.append(f"- **Site:** {meta['site_name']}")
        if meta.get("canonical"):
            md.append(f"- **Canonical:** {meta['canonical']}")
        if meta.get("lang"):
            md.append(f"- **Language:** {meta['lang']}")
        if meta.get("fetched_url"):
            md.append(f"- **Fetched From:** {meta['fetched_url']}")
        if md:
            lines.append("## Metadata")
            lines.extend(md)

    # Body text
    if include_text and body:
        if verbosity == "Brief":
            brief, was_more = _truncate(body, 800)
            lines.append("## Text")
            lines.append(brief)
            if was_more or body_truncated:
                lines.append("\n> (Trimmed for brevity)")
        else:
            lines.append("## Text")
            lines.append(body)
            if body_truncated:
                lines.append("\n> (Trimmed for brevity)")

    # Links section
    if include_links and links:
        lines.append(f"## Links ({len(links)})")
        for text, url in links:
            lines.append(f"- [{text}]({url})")

    return "\n\n".join(lines).strip()


def extract_relevant(  # <-- MCP tool #1
    url: str,
    verbosity: str = "Standard",
    include_metadata: bool = True,
    include_text: bool = True,
    include_links: bool = True,
    max_chars: int = 3000,
    max_links: int = 20,
) -> str:
    """
    (layman) Given a URL, return a tight Markdown summary: title, key metadata, readable text, and links.
    """
    if not url or not url.strip():
        return "Please enter a valid URL."

    try:
        resp = _http_get(url)
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"

    final_url = str(resp.url)
    ctype = resp.headers.get("Content-Type", "")
    if "html" not in ctype.lower():
        return f"Unsupported content type for extraction: {ctype or 'unknown'}"

    # Decode to text
    resp.encoding = resp.encoding or resp.apparent_encoding
    html = resp.text

    # Full-page soup for metadata
    full_soup = BeautifulSoup(html, "lxml")
    meta = _extract_metadata(full_soup, final_url)

    # Readable content
    body_text, readable_soup = _extract_main_text(html)
    if not body_text:
        # Fallback to "whole-page text" if Readability found nothing
        fallback_text = full_soup.get_text(" ", strip=True)
        body_text = _normalize_whitespace(fallback_text)

    # Verbosity presets (we keep the smaller of preset vs. user cap)
    preset_caps = {"Brief": 1200, "Standard": 3000, "Full": 999_999}
    target_cap = preset_caps.get(verbosity, 3000)
    cap = min(max_chars if max_chars > 0 else target_cap, target_cap)
    body_text, truncated = _truncate(body_text, cap) if include_text else ("", False)

    # Extract links from the simplified content only
    links = _extract_links(readable_soup, final_url, max_links=max_links if include_links else 0)

    # Final compact Markdown
    md = _format_markdown(
        meta=meta,
        body=body_text,
        body_truncated=truncated,
        links=links,
        include_text=include_text,
        include_metadata=include_metadata,
        include_links=include_links,
        verbosity=verbosity,
    )
    return md or "No content could be extracted."
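# (layman) Illustrative call (any reachable URL works; this only describes the
# expected shape of the output, not real data):
#     extract_relevant("https://example.com", verbosity="Brief")
# returns a Markdown string that starts with "# <page title>" followed by the
# optional "## Metadata", "## Text", and "## Links (N)" sections assembled in
# _format_markdown above.
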


# ==========================
# Websearch: DuckDuckGo tool
# ==========================

def web_search(  # <-- MCP tool #2
    input_query: str,
    max_results: int = 5,
) -> List[Dict[Literal["snippet", "title", "link"], str]]:
    """
    (layman) Run a DuckDuckGo search and return a list of {snippet, title, link}.
    """
    if not input_query or not input_query.strip():
        return []

    # Create the search tool (LangChain community wrapper)
    search = DuckDuckGoSearchResults(output_format="list", num_results=max_results)

    # Run the search and return results as a list of dicts
    results = search.invoke(input_query)
    return results
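# (layman) Shape of the returned list (placeholder values; real snippets,
# titles, and links come from DuckDuckGo at call time):
#     [
#         {"snippet": "...", "title": "...", "link": "https://..."},
#         ...
#     ]
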


# =====================
# UI: two-tab interface
# =====================

# --- Fetch tab (compact controllable extraction) ---
fetch_interface = gr.Interface(
    fn=extract_relevant,  # (layman) connect the function to the UI
    inputs=[
        gr.Textbox(label="URL", placeholder="https://example.com/article"),
        gr.Dropdown(label="Verbosity", choices=["Brief", "Standard", "Full"], value="Standard"),
        gr.Checkbox(value=True, label="Include Metadata"),
        gr.Checkbox(value=True, label="Include Main Text"),
        gr.Checkbox(value=True, label="Include Links"),
        gr.Slider(400, 12000, value=3000, step=100, label="Max Characters (body text)"),
        gr.Slider(0, 100, value=20, step=1, label="Max Links"),
    ],
    outputs=gr.Markdown(label="Extracted Summary"),
    title="Fetch — Clean Extract",
    description="Extract title, key metadata, readable text, and links. No noisy HTML.",
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme",
)

# --- Websearch tab (DuckDuckGo) ---
websearch_interface = gr.Interface(
    fn=web_search,  # (layman) connect the function to the UI
    inputs=[
        gr.Textbox(value="", label="Search query", placeholder="site:example.com interesting topic"),
        gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results"),
    ],
    outputs=gr.JSON(label="Search results"),
    title="Websearch — DuckDuckGo",
    description="Search the web using DuckDuckGo; returns snippet, title, and link.",
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme",
)

# --- Combine both into a single app with tabs ---
demo = gr.TabbedInterface(
    interface_list=[fetch_interface, websearch_interface],
    tab_names=["Fetch", "Websearch"],
    title="Web MCP — Fetch + Websearch",
    theme="Nymbo/Nymbo_Theme",
)

# Launch the UI and expose both functions as MCP tools in one server
if __name__ == "__main__":
    demo.launch(mcp_server=True)
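# (layman) With mcp_server=True, the same server exposes extract_relevant and
# web_search as MCP tools alongside the tabbed web UI; the exact MCP endpoint
# URL depends on your Gradio version (check the Gradio MCP docs). For a quick
# manual check without an MCP client, the functions can also be called
# directly, e.g.:
#     python -c "from app import extract_relevant; print(extract_relevant('https://example.com'))"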