Nymbo committed on
Commit
645af7f
·
verified ·
1 Parent(s): 4fe48d6

Update app.py

Files changed (1)
  1. app.py +177 -61
app.py CHANGED
@@ -1,14 +1,16 @@
1
  # File: main/app.py
2
- # Purpose: One Space that offers four tools/tabs:
3
  # 1) Fetch — extract relevant page content (title, metadata, clean text, hyperlinks)
4
  # 2) Websearch — structured DuckDuckGo search via LangChain tool (JSON)
5
  # 3) Unstructured DDG — raw DuckDuckGo list[dict] rendered into a Textbox
6
  # 4) DDG (Concise) — ultra-succinct DuckDuckGo search that emits JSONL with short keys to minimize tokens
 
7
 
8
  from __future__ import annotations
9
 
10
  import re
11
  import json
 
12
  from typing import List, Dict, Literal, Tuple
13
 
14
  import gradio as gr
@@ -27,6 +29,7 @@ from duckduckgo_search import DDGS
27
  def _http_get(url: str) -> requests.Response:
28
  """
29
  Download the page politely with a short timeout and realistic headers.
 
30
  """
31
  headers = {
32
  "User-Agent": "Mozilla/5.0 (compatible; WebMCP/1.0; +https://example.com)",
@@ -39,6 +42,7 @@ def _http_get(url: str) -> requests.Response:
39
  def _normalize_whitespace(text: str) -> str:
40
  """
41
  Squeeze extra spaces and blank lines to keep things compact.
 
42
  """
43
  text = re.sub(r"[ \t\u00A0]+", " ", text)
44
  text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())
@@ -48,6 +52,7 @@ def _normalize_whitespace(text: str) -> str:
48
  def _truncate(text: str, max_chars: int) -> Tuple[str, bool]:
49
  """
50
  Cut text if it gets too long; return the text and whether we trimmed.
 
51
  """
52
  if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
53
  return text, False
@@ -57,6 +62,7 @@ def _truncate(text: str, max_chars: int) -> Tuple[str, bool]:
57
  def _shorten(text: str, limit: int) -> str:
58
  """
59
  Hard cap a string with an ellipsis to keep tokens small.
 
60
  """
61
  if limit <= 0 or len(text) <= limit:
62
  return text
@@ -66,6 +72,7 @@ def _shorten(text: str, limit: int) -> str:
66
  def _domain_of(url: str) -> str:
67
  """
68
  Show a friendly site name like "example.com".
 
69
  """
70
  try:
71
  return urlparse(url).netloc or ""
@@ -86,6 +93,7 @@ def _og(soup: BeautifulSoup, prop: str) -> str | None:
86
  def _extract_metadata(soup: BeautifulSoup, final_url: str) -> Dict[str, str]:
87
  """
88
  Pull the useful bits: title, description, site name, canonical URL, language, etc.
 
89
  """
90
  meta: Dict[str, str] = {}
91
 
@@ -125,6 +133,7 @@ def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
125
  """
126
  Use Readability to isolate the main article and turn it into clean text.
127
  Returns (clean_text, soup_of_readable_html).
 
128
  """
129
  # Simplified article HTML from Readability
130
  doc = Document(html)
@@ -152,6 +161,7 @@ def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
152
  def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> List[Tuple[str, str]]:
153
  """
154
  Collect clean, unique, absolute links from the readable section only.
 
155
  """
156
  seen = set()
157
  links: List[Tuple[str, str]] = []
@@ -194,6 +204,7 @@ def _format_markdown(
194
  ) -> str:
195
  """
196
  Assemble a compact Markdown summary with optional sections.
 
197
  """
198
  lines: List[str] = []
199
 
@@ -254,25 +265,7 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
254
  Fetch a web page and return a compact Markdown summary that includes title, key
255
  metadata, readable main text, and outbound links.
256
 
257
- Args:
258
- url (str): The HTTP/HTTPS URL to fetch. Must be publicly reachable.
259
- verbosity (str): Controls body length. One of: "Brief", "Standard", or "Full".
260
- - Brief ≈ up to 1,200 chars
260
- - Standard ≈ up to 3,000 chars
262
- - Full = no cap (still limited by `max_chars` if smaller)
263
- include_metadata (bool): If True, include a Metadata section with description,
264
- site name, canonical URL, language, and fetched URL.
265
- include_text (bool): If True, include the extracted readable body text.
266
- include_links (bool): If True, include a list of outbound links found in the
267
- readable section only (deduped and fragment-stripped).
268
- max_chars (int): Hard cap for body text length. Numeric value between 400 and
269
- 12000. The effective cap is the smaller of this value and the preset based
270
- on `verbosity`.
271
- max_links (int): Maximum number of links to include. Numeric value between 0 and 100.
272
-
273
- Returns:
274
- str: Markdown string containing the extracted summary. If the page cannot be
275
- fetched or parsed, a short error message is returned instead.
276
  """
277
  if not url or not url.strip():
278
  return "Please enter a valid URL."
@@ -336,17 +329,7 @@ def Search_Structured( # <-- MCP tool #2 (Structured DDG)
336
  ) -> List[Dict[Literal["snippet", "title", "link"], str]]:
337
  """
338
  Run a DuckDuckGo search and return structured results as a list of dictionaries.
339
-
340
- Args:
341
- input_query (str): The search query. Supports operators like site:, quotes,
342
- and boolean keywords.
343
- max_results (int): Number of results to return (1–20).
344
-
345
- Returns:
346
- List[Dict[Literal["snippet","title","link"], str]]: Each item contains:
347
- - snippet: Short text snippet
348
- - title: Result title
349
- - link: Result URL
350
  """
351
  if not input_query or not input_query.strip():
352
  return []
@@ -369,13 +352,7 @@ def Search_Raw( # <-- MCP tool #3 (Unstructured DDG)
369
  """
370
  Run a DuckDuckGo search using the native `duckduckgo_search` client and return the
371
  raw Python list of dictionaries from the library.
372
-
373
- Args:
374
- query (str): The search query string.
375
-
376
- Returns:
377
- list[dict]: The unmodified objects returned by `DDGS().text(...)`, typically
378
- containing keys like: title, href/link, body/snippet, source, etc.
379
  """
380
  if not query or not query.strip():
381
  return []
@@ -400,25 +377,8 @@ def Search_Concise( # <-- MCP tool #4 (Concise DDG)
400
  Run a DuckDuckGo search and return ultra-compact JSONL lines with short keys to
401
  minimize tokens.
402
 
403
- Args:
404
- query (str): The search query string.
405
- max_results (int): Maximum number of results to retrieve (1–20).
406
- include_snippets (bool): If True, include a shortened snippet per result under
407
- key "s".
408
- max_snippet_chars (int): Hard cap for snippet length when `include_snippets`
409
- is True. Range 20–200.
410
- dedupe_domains (bool): If True, only keep the first result per domain.
411
- title_chars (int): Hard cap for the title length. Range 20–120.
412
-
413
- Returns:
414
- str: Newline-delimited JSON (JSONL). Each line is a compact JSON object with
415
- short keys: "t" (title), "u" (URL), and optionally "s" (snippet).
416
-
417
- Example lines:
418
- {"t":"Example","u":"https://example.com/x"}
419
- {"t":"Another…","u":"https://a.com/y","s":"Short snippet…"}
420
  """
421
-
422
  if not query or not query.strip():
423
  return ""
424
 
@@ -426,7 +386,6 @@ def Search_Concise( # <-- MCP tool #4 (Concise DDG)
426
  with DDGS() as ddgs:
427
  raw = ddgs.text(query, max_results=max_results)
428
  except Exception as e:
429
-
430
  return json.dumps({"error": str(e)[:120]}, ensure_ascii=False, separators=(",", ":"))
431
 
432
  seen_domains = set()
@@ -458,8 +417,133 @@ def Search_Concise( # <-- MCP tool #4 (Concise DDG)
458
  return "\n".join(lines)
459
 
460
 
461
  # ======================
462
- # UI: four-tab interface
463
  # ======================
464
 
465
  # --- Fetch tab (compact controllable extraction) ---
@@ -545,19 +629,51 @@ concise_interface = gr.Interface(
545
  submit_btn="Search",
546
  )
547
 
548
  # --- Combine all into a single app with tabs ---
549
  demo = gr.TabbedInterface(
550
- interface_list=[fetch_interface, websearch_interface, unstructured_interface, concise_interface],
551
  tab_names=[
552
  "Fetch Webpage",
553
  "DuckDuckGo Search (Structured)",
554
  "DuckDuckGo Search (Raw)",
555
  "DuckDuckGo Search (Concise)",
 
556
  ],
557
- title="Web MCP β€” Fetch & DuckDuckGo search with customizable output modes.",
558
  theme="Nymbo/Nymbo_Theme",
559
  )
560
 
561
  # Launch the UI and expose all functions as MCP tools in one server
562
  if __name__ == "__main__":
563
- demo.launch(mcp_server=True)
 
1
  # File: main/app.py
2
+ # Purpose: One Space that offers five tools/tabs:
3
  # 1) Fetch — extract relevant page content (title, metadata, clean text, hyperlinks)
4
  # 2) Websearch — structured DuckDuckGo search via LangChain tool (JSON)
5
  # 3) Unstructured DDG — raw DuckDuckGo list[dict] rendered into a Textbox
6
  # 4) DDG (Concise) — ultra-succinct DuckDuckGo search that emits JSONL with short keys to minimize tokens
7
+ # 5) Generate Sitemap — LIMITED: grouped internal/external links with an optional per-domain cap (and a .md download)
8
 
9
  from __future__ import annotations
10
 
11
  import re
12
  import json
13
+ import tempfile # <-- used to create a downloadable .md for the sitemap
14
  from typing import List, Dict, Literal, Tuple
15
 
16
  import gradio as gr
 
29
  def _http_get(url: str) -> requests.Response:
30
  """
31
  Download the page politely with a short timeout and realistic headers.
32
+ (Layman's terms: grab the web page like a normal browser would, but quickly.)
33
  """
34
  headers = {
35
  "User-Agent": "Mozilla/5.0 (compatible; WebMCP/1.0; +https://example.com)",
 
42
  def _normalize_whitespace(text: str) -> str:
43
  """
44
  Squeeze extra spaces and blank lines to keep things compact.
45
+ (Layman's terms: tidy up the text so it's not full of weird spacing.)
46
  """
47
  text = re.sub(r"[ \t\u00A0]+", " ", text)
48
  text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())
 
52
  def _truncate(text: str, max_chars: int) -> Tuple[str, bool]:
53
  """
54
  Cut text if it gets too long; return the text and whether we trimmed.
55
+ (Layman's terms: shorten long text and tell us if we had to cut it.)
56
  """
57
  if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
58
  return text, False
 
62
  def _shorten(text: str, limit: int) -> str:
63
  """
64
  Hard cap a string with an ellipsis to keep tokens small.
65
+ (Layman's terms: force a string to a max length with an ellipsis.)
66
  """
67
  if limit <= 0 or len(text) <= limit:
68
  return text
 
72
  def _domain_of(url: str) -> str:
73
  """
74
  Show a friendly site name like "example.com".
75
+ (Layman's terms: pull the website's domain.)
76
  """
77
  try:
78
  return urlparse(url).netloc or ""
 
93
  def _extract_metadata(soup: BeautifulSoup, final_url: str) -> Dict[str, str]:
94
  """
95
  Pull the useful bits: title, description, site name, canonical URL, language, etc.
96
+ (Layman's terms: gather page basics like title/description/address.)
97
  """
98
  meta: Dict[str, str] = {}
99
 
 
133
  """
134
  Use Readability to isolate the main article and turn it into clean text.
135
  Returns (clean_text, soup_of_readable_html).
136
+ (Layman's terms: find the real article text and clean it.)
137
  """
138
  # Simplified article HTML from Readability
139
  doc = Document(html)
 
161
  def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> List[Tuple[str, str]]:
162
  """
163
  Collect clean, unique, absolute links from the readable section only.
164
+ (Layman's terms: pull a tidy list of links from the article body.)
165
  """
166
  seen = set()
167
  links: List[Tuple[str, str]] = []
 
204
  ) -> str:
205
  """
206
  Assemble a compact Markdown summary with optional sections.
207
+ (Layman's terms: build the final markdown output with options.)
208
  """
209
  lines: List[str] = []
210
 
 
265
  Fetch a web page and return a compact Markdown summary that includes title, key
266
  metadata, readable main text, and outbound links.
267
 
268
+ (Layman's terms: summarize a page with clean text + useful details.)
 
269
  """
270
  if not url or not url.strip():
271
  return "Please enter a valid URL."
 
329
  ) -> List[Dict[Literal["snippet", "title", "link"], str]]:
330
  """
331
  Run a DuckDuckGo search and return structured results as a list of dictionaries.
332
+ (Layman's terms: search DDG and get clean JSON objects.)
333
  """
334
  if not input_query or not input_query.strip():
335
  return []
 
352
  """
353
  Run a DuckDuckGo search using the native `duckduckgo_search` client and return the
354
  raw Python list of dictionaries from the library.
355
+ (Layman's terms: search DDG and show exactly what the library returns.)
356
  """
357
  if not query or not query.strip():
358
  return []
 
377
  Run a DuckDuckGo search and return ultra-compact JSONL lines with short keys to
378
  minimize tokens.
379
 
380
+ (Layman's terms: the tiniest useful search output possible.)
381
  """
 
382
  if not query or not query.strip():
383
  return ""
384
 
 
386
  with DDGS() as ddgs:
387
  raw = ddgs.text(query, max_results=max_results)
388
  except Exception as e:
 
389
  return json.dumps({"error": str(e)[:120]}, ensure_ascii=False, separators=(",", ":"))
390
 
391
  seen_domains = set()
 
417
  return "\n".join(lines)
418
 
419
 
420
+ # ============================================
421
+ # Generate Sitemap: LIMITED (new MCP tool #5)
422
+ # ============================================
423
+
424
+ def Generate_Sitemap_Limited(
425
+ url: str,
426
+ max_links_per_domain: int = 0,
427
+ ) -> Tuple[str, str | None]:
428
+ """
429
+ Generate a grouped sitemap from all anchor links on a page, with an optional
430
+ per-domain cap. Returns (markdown, downloadable_file_path).
431
+
432
+ (Layman's terms: list all links on a page, grouped by Internal/External domain.
433
+ You can limit how many per domain; 0 means show all.)
434
+ """
435
+ # --- Basic validation & normalization ---
436
+ if not url or not url.strip():
437
+ return "Please enter a valid URL.", None
438
+
439
+ # If the user forgot the scheme, assume https
440
+ if not url.lower().startswith(("http://", "https://")):
441
+ url = "https://" + url.strip()
442
+
443
+ # --- Fetch the page safely ---
444
+ try:
445
+ resp = _http_get(url)
446
+ resp.raise_for_status()
447
+ except requests.exceptions.RequestException as e:
448
+ return f"Error fetching URL: {str(e)}", None
449
+
450
+ base_url = str(resp.url) # follow redirects and use the final URL
451
+ content_type = resp.headers.get("Content-Type", "")
452
+ if "html" not in content_type.lower():
453
+ return "The provided URL does not appear to be an HTML page.", None
454
+
455
+ # --- Parse and collect links ---
456
+ soup = BeautifulSoup(resp.content, "lxml") # fast, lenient HTML parsing
457
+ anchors = soup.find_all("a", href=True)
458
+
459
+ seen_urls: set[str] = set()
460
+ items: List[Dict[str, str]] = []
461
+
462
+ for a in anchors:
463
+ href = (a.get("href") or "").strip()
464
+ if not href:
465
+ continue
466
+
467
+ # Skip non-navigational/unsupported schemes
468
+ if href.startswith(("#", "javascript:", "mailto:", "tel:")):
469
+ continue
470
+
471
+ # Resolve relative links and strip fragments
472
+ absolute = urljoin(base_url, href)
473
+ absolute, _ = urldefrag(absolute)
474
+
475
+ # Deduplicate and skip self
476
+ if absolute in seen_urls or absolute == base_url:
477
+ continue
478
+ seen_urls.add(absolute)
479
+
480
+ # Use link text if available; otherwise the URL itself
481
+ text = (a.get_text(" ", strip=True) or href).strip()
482
+ if len(text) > 100:
483
+ text = text[:100] + "..."
484
+
485
+ items.append({"text": text, "url": absolute})
486
+
487
+ if not items:
488
+ return "No links found on this page.", None
489
+
490
+ # --- Group by Internal vs External domains ---
491
+ base_netloc = urlparse(base_url).netloc
492
+ domain_groups: Dict[str, List[Dict[str, str]]] = {}
493
+
494
+ for it in items:
495
+ netloc = urlparse(it["url"]).netloc
496
+ key = "Internal Links" if netloc == base_netloc else f"External Links ({netloc})"
497
+ domain_groups.setdefault(key, []).append(it)
498
+
499
+ # --- Build Markdown with optional per-domain limit ---
500
+ total_links = len(items)
501
+ md_lines: List[str] = []
502
+ md_lines.append("# Sitemap")
503
+ md_lines.append(f"Base URL: {base_url}")
504
+ md_lines.append(f"Found {total_links} links:\n")
505
+
506
+ # Show Internal first, then external groups sorted by name
507
+ keys_sorted = ["Internal Links"] + sorted([k for k in domain_groups if k != "Internal Links"])
508
+
509
+ for group_key in keys_sorted:
510
+ if group_key not in domain_groups:
511
+ continue
512
+
513
+ group_links = domain_groups[group_key]
514
+ md_lines.append(f"## {group_key}\n")
515
+
516
+ if max_links_per_domain and max_links_per_domain > 0:
517
+ links_to_show = group_links[:max_links_per_domain]
518
+ remaining = max(0, len(group_links) - max_links_per_domain)
519
+ else:
520
+ links_to_show = group_links
521
+ remaining = 0
522
+
523
+ for link in links_to_show:
524
+ md_lines.append(f"- [{link['text']}]({link['url']})")
525
+
526
+ if remaining > 0:
527
+ md_lines.append(f"- ... and {remaining} more links")
528
+
529
+ md_lines.append("") # blank line after each group
530
+
531
+ sitemap_md = "\n".join(md_lines).strip()
532
+
533
+ # --- Save to a temp .md so the UI can offer a download ---
534
+ try:
535
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".md", mode="w", encoding="utf-8") as f:
536
+ f.write(sitemap_md)
537
+ temp_path = f.name
538
+ except Exception:
539
+ # If writing fails, still return the markdown
540
+ temp_path = None
541
+
542
+ return sitemap_md, temp_path
543
+
544
+
545
  # ======================
546
+ # UI: five-tab interface
547
  # ======================
548
 
549
  # --- Fetch tab (compact controllable extraction) ---
 
629
  submit_btn="Search",
630
  )
631
 
632
+ # --- Generate Sitemap tab (LIMITED, grouped + optional per-domain cap) ---
633
+ sitemap_interface = gr.Interface(
634
+ fn=Generate_Sitemap_Limited,
635
+ inputs=[
636
+ gr.Textbox(
637
+ label="Website URL",
638
+ placeholder="https://example.com or example.com"
639
+ ),
640
+ gr.Slider(
641
+ minimum=0,
642
+ maximum=1000,
643
+ value=0,
644
+ step=1,
645
+ label="Max links per domain (0 = show all)"
646
+ ),
647
+ ],
648
+ outputs=[
649
+ gr.Markdown(label="Sitemap (Markdown)"),
650
+ gr.File(label="Download .md"),
651
+ ],
652
+ title="Generate Sitemap",
653
+ description="Group links by Internal/External domains; optionally limit links per domain.",
654
+ api_description=(
655
+ "Scan a page and build a grouped sitemap of anchor links. Links are grouped as "
656
+ "Internal or External (per domain). Set a per-domain cap; 0 shows all."
657
+ ),
658
+ allow_flagging="never",
659
+ theme="Nymbo/Nymbo_Theme",
660
+ submit_btn="Generate",
661
+ )
662
+
663
  # --- Combine all into a single app with tabs ---
664
  demo = gr.TabbedInterface(
665
+ interface_list=[fetch_interface, websearch_interface, unstructured_interface, concise_interface, sitemap_interface],
666
  tab_names=[
667
  "Fetch Webpage",
668
  "DuckDuckGo Search (Structured)",
669
  "DuckDuckGo Search (Raw)",
670
  "DuckDuckGo Search (Concise)",
671
+ "Generate Sitemap",
672
  ],
673
+ title="Web MCP β€” Fetch, Search, and Sitemaps with customizable output modes.",
674
  theme="Nymbo/Nymbo_Theme",
675
  )
676
 
677
  # Launch the UI and expose all functions as MCP tools in one server
678
  if __name__ == "__main__":
679
+ demo.launch(mcp_server=True)
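
A quick way to sanity-check the new sitemap tool outside the Gradio UI is to call the function directly. The sketch below is not part of the commit; it assumes `app.py` and its dependencies are importable from the working directory, and the URL and per-domain cap are placeholder values.

```python
# Hypothetical local check of the sitemap tool added in this commit.
# Assumes app.py and its dependencies (gradio, requests, beautifulsoup4, lxml, etc.)
# are installed; the URL and the cap of 5 are placeholders.
from app import Generate_Sitemap_Limited

sitemap_md, md_path = Generate_Sitemap_Limited(
    "https://example.com",   # page whose anchor links get grouped
    max_links_per_domain=5,  # 0 means "show all links per domain"
)

print(sitemap_md)                     # Markdown grouped into Internal/External links
print("Downloadable copy:", md_path)  # temp .md path, or None if writing the file failed
```

Since the Gradio tab and the MCP tool both call `Generate_Sitemap_Limited` directly, this should produce the same Markdown the UI renders.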