Update app.py
app.py
CHANGED
@@ -1,14 +1,16 @@
 # File: main/app.py
-# Purpose: One Space that offers
+# Purpose: One Space that offers five tools/tabs:
 # 1) Fetch → extract relevant page content (title, metadata, clean text, hyperlinks)
 # 2) Websearch → structured DuckDuckGo search via LangChain tool (JSON)
 # 3) Unstructured DDG → raw DuckDuckGo list[dict] rendered into a Textbox
 # 4) DDG (Concise) → ultra-succinct DuckDuckGo search that emits JSONL with short keys to minimize tokens
+# 5) Generate Sitemap → LIMITED: grouped internal/external links with an optional per-domain cap (and a .md download)

 from __future__ import annotations

 import re
 import json
+import tempfile  # <-- used to create a downloadable .md for the sitemap
 from typing import List, Dict, Literal, Tuple

 import gradio as gr
@@ -27,6 +29,7 @@ from duckduckgo_search import DDGS
 def _http_get(url: str) -> requests.Response:
     """
     Download the page politely with a short timeout and realistic headers.
+    (Layman's terms: grab the web page like a normal browser would, but quickly.)
     """
     headers = {
         "User-Agent": "Mozilla/5.0 (compatible; WebMCP/1.0; +https://example.com)",
@@ -39,6 +42,7 @@ def _http_get(url: str) -> requests.Response:
 def _normalize_whitespace(text: str) -> str:
     """
     Squeeze extra spaces and blank lines to keep things compact.
+    (Layman's terms: tidy up the text so it's not full of weird spacing.)
     """
     text = re.sub(r"[ \t\u00A0]+", " ", text)
     text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())
@@ -48,6 +52,7 @@ def _normalize_whitespace(text: str) -> str:
 def _truncate(text: str, max_chars: int) -> Tuple[str, bool]:
     """
     Cut text if it gets too long; return the text and whether we trimmed.
+    (Layman's terms: shorten long text and tell us if we had to cut it.)
     """
     if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
         return text, False
@@ -57,6 +62,7 @@ def _truncate(text: str, max_chars: int) -> Tuple[str, bool]:
 def _shorten(text: str, limit: int) -> str:
     """
     Hard cap a string with an ellipsis to keep tokens small.
+    (Layman's terms: force a string to a max length with an ellipsis.)
     """
     if limit <= 0 or len(text) <= limit:
         return text
@@ -66,6 +72,7 @@ def _shorten(text: str, limit: int) -> str:
 def _domain_of(url: str) -> str:
     """
     Show a friendly site name like "example.com".
+    (Layman's terms: pull the website's domain.)
     """
     try:
         return urlparse(url).netloc or ""
@@ -86,6 +93,7 @@ def _og(soup: BeautifulSoup, prop: str) -> str | None:
 def _extract_metadata(soup: BeautifulSoup, final_url: str) -> Dict[str, str]:
     """
     Pull the useful bits: title, description, site name, canonical URL, language, etc.
+    (Layman's terms: gather page basics like title/description/address.)
     """
     meta: Dict[str, str] = {}

@@ -125,6 +133,7 @@ def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
     """
     Use Readability to isolate the main article and turn it into clean text.
     Returns (clean_text, soup_of_readable_html).
+    (Layman's terms: find the real article text and clean it.)
     """
     # Simplified article HTML from Readability
     doc = Document(html)
@@ -152,6 +161,7 @@ def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
 def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> List[Tuple[str, str]]:
     """
     Collect clean, unique, absolute links from the readable section only.
+    (Layman's terms: pull a tidy list of links from the article body.)
     """
     seen = set()
     links: List[Tuple[str, str]] = []
@@ -194,6 +204,7 @@ def _format_markdown(
 ) -> str:
     """
     Assemble a compact Markdown summary with optional sections.
+    (Layman's terms: build the final markdown output with options.)
     """
     lines: List[str] = []

@@ -254,25 +265,7 @@ def Fetch_Webpage(  # <-- MCP tool #1 (Fetch)
     Fetch a web page and return a compact Markdown summary that includes title, key
     metadata, readable main text, and outbound links.

-
-        url (str): The HTTP/HTTPS URL to fetch. Must be publicly reachable.
-        verbosity (str): Controls body length. One of: "Brief", "Standard", or "Full".
-            - Brief → up to 1,200 chars
-            - Standard → up to 3,000 chars
-            - Full = no cap (still limited by `max_chars` if smaller)
-        include_metadata (bool): If True, include a Metadata section with description,
-            site name, canonical URL, language, and fetched URL.
-        include_text (bool): If True, include the extracted readable body text.
-        include_links (bool): If True, include a list of outbound links found in the
-            readable section only (deduped and fragment-stripped).
-        max_chars (int): Hard cap for body text length. Numeric value between 400 and
-            12000. The effective cap is the smaller of this value and the preset based
-            on `verbosity`.
-        max_links (int): Maximum number of links to include. Numeric value between 0 and 100.
-
-    Returns:
-        str: Markdown string containing the extracted summary. If the page cannot be
-            fetched or parsed, a short error message is returned instead.
+    (Layman's terms: summarize a page with clean text + useful details.)
     """
     if not url or not url.strip():
         return "Please enter a valid URL."
@@ -336,17 +329,7 @@ def Search_Structured(  # <-- MCP tool #2 (Structured DDG)
 ) -> List[Dict[Literal["snippet", "title", "link"], str]]:
     """
     Run a DuckDuckGo search and return structured results as a list of dictionaries.
-
-    Args:
-        input_query (str): The search query. Supports operators like site:, quotes,
-            and boolean keywords.
-        max_results (int): Number of results to return (1–20).
-
-    Returns:
-        List[Dict[Literal["snippet","title","link"], str]]: Each item contains:
-            - snippet: Short text snippet
-            - title: Result title
-            - link: Result URL
+    (Layman's terms: search DDG and get clean JSON objects.)
     """
     if not input_query or not input_query.strip():
         return []
@@ -369,13 +352,7 @@ def Search_Raw(  # <-- MCP tool #3 (Unstructured DDG)
     """
     Run a DuckDuckGo search using the native `duckduckgo_search` client and return the
     raw Python list of dictionaries from the library.
-
-    Args:
-        query (str): The search query string.
-
-    Returns:
-        list[dict]: The unmodified objects returned by `DDGS().text(...)`, typically
-            containing keys like: title, href/link, body/snippet, source, etc.
+    (Layman's terms: search DDG and show exactly what the library returns.)
     """
     if not query or not query.strip():
         return []
@@ -400,25 +377,8 @@ def Search_Concise(  # <-- MCP tool #4 (Concise DDG)
     Run a DuckDuckGo search and return ultra-compact JSONL lines with short keys to
     minimize tokens.

-
-        query (str): The search query string.
-        max_results (int): Maximum number of results to retrieve (1–20).
-        include_snippets (bool): If True, include a shortened snippet per result under
-            key "s".
-        max_snippet_chars (int): Hard cap for snippet length when `include_snippets`
-            is True. Range 20–200.
-        dedupe_domains (bool): If True, only keep the first result per domain.
-        title_chars (int): Hard cap for the title length. Range 20–120.
-
-    Returns:
-        str: Newline-delimited JSON (JSONL). Each line is a compact JSON object with
-            short keys: "t" (title), "u" (URL), and optionally "s" (snippet).
-
-    Example lines:
-        {"t":"Example","u":"https://example.com/x"}
-        {"t":"Another…","u":"https://a.com/y","s":"Short snippet…"}
+    (Layman's terms: the tiniest useful search output possible.)
     """
-
     if not query or not query.strip():
         return ""

@@ -426,7 +386,6 @@ def Search_Concise(  # <-- MCP tool #4 (Concise DDG)
         with DDGS() as ddgs:
             raw = ddgs.text(query, max_results=max_results)
     except Exception as e:
-
         return json.dumps({"error": str(e)[:120]}, ensure_ascii=False, separators=(",", ":"))

     seen_domains = set()
@@ -458,8 +417,133 @@ def Search_Concise(  # <-- MCP tool #4 (Concise DDG)
     return "\n".join(lines)


+# ============================================
+# Generate Sitemap: LIMITED (new MCP tool #5)
+# ============================================
+
+def Generate_Sitemap_Limited(
+    url: str,
+    max_links_per_domain: int = 0,
+) -> Tuple[str, str | None]:
+    """
+    Generate a grouped sitemap from all anchor links on a page, with an optional
+    per-domain cap. Returns (markdown, downloadable_file_path).
+
+    (Layman's terms: list all links on a page, grouped by Internal/External domain.
+    You can limit how many per domain; 0 means show all.)
+    """
+    # --- Basic validation & normalization ---
+    if not url or not url.strip():
+        return "Please enter a valid URL.", None
+
+    # If the user forgot the scheme, assume https
+    if not url.lower().startswith(("http://", "https://")):
+        url = "https://" + url.strip()
+
+    # --- Fetch the page safely ---
+    try:
+        resp = _http_get(url)
+        resp.raise_for_status()
+    except requests.exceptions.RequestException as e:
+        return f"Error fetching URL: {str(e)}", None
+
+    base_url = str(resp.url)  # follow redirects and use the final URL
+    content_type = resp.headers.get("Content-Type", "")
+    if "html" not in content_type.lower():
+        return "The provided URL does not appear to be an HTML page.", None
+
+    # --- Parse and collect links ---
+    soup = BeautifulSoup(resp.content, "lxml")  # fast, lenient HTML parsing
+    anchors = soup.find_all("a", href=True)
+
+    seen_urls: set[str] = set()
+    items: List[Dict[str, str]] = []
+
+    for a in anchors:
+        href = (a.get("href") or "").strip()
+        if not href:
+            continue
+
+        # Skip non-navigational/unsupported schemes
+        if href.startswith(("#", "javascript:", "mailto:", "tel:")):
+            continue
+
+        # Resolve relative links and strip fragments
+        absolute = urljoin(base_url, href)
+        absolute, _ = urldefrag(absolute)
+
+        # Deduplicate and skip self
+        if absolute in seen_urls or absolute == base_url:
+            continue
+        seen_urls.add(absolute)
+
+        # Use link text if available; otherwise the URL itself
+        text = (a.get_text(" ", strip=True) or href).strip()
+        if len(text) > 100:
+            text = text[:100] + "..."
+
+        items.append({"text": text, "url": absolute})
+
+    if not items:
+        return "No links found on this page.", None
+
+    # --- Group by Internal vs External domains ---
+    base_netloc = urlparse(base_url).netloc
+    domain_groups: Dict[str, List[Dict[str, str]]] = {}
+
+    for it in items:
+        netloc = urlparse(it["url"]).netloc
+        key = "Internal Links" if netloc == base_netloc else f"External Links ({netloc})"
+        domain_groups.setdefault(key, []).append(it)
+
+    # --- Build Markdown with optional per-domain limit ---
+    total_links = len(items)
+    md_lines: List[str] = []
+    md_lines.append("# Sitemap")
+    md_lines.append(f"Base URL: {base_url}")
+    md_lines.append(f"Found {total_links} links:\n")
+
+    # Show Internal first, then external groups sorted by name
+    keys_sorted = ["Internal Links"] + sorted([k for k in domain_groups if k != "Internal Links"])
+
+    for group_key in keys_sorted:
+        if group_key not in domain_groups:
+            continue
+
+        group_links = domain_groups[group_key]
+        md_lines.append(f"## {group_key}\n")
+
+        if max_links_per_domain and max_links_per_domain > 0:
+            links_to_show = group_links[:max_links_per_domain]
+            remaining = max(0, len(group_links) - max_links_per_domain)
+        else:
+            links_to_show = group_links
+            remaining = 0
+
+        for link in links_to_show:
+            md_lines.append(f"- [{link['text']}]({link['url']})")
+
+        if remaining > 0:
+            md_lines.append(f"- ... and {remaining} more links")
+
+        md_lines.append("")  # blank line after each group
+
+    sitemap_md = "\n".join(md_lines).strip()
+
+    # --- Save to a temp .md so the UI can offer a download ---
+    try:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".md", mode="w", encoding="utf-8") as f:
+            f.write(sitemap_md)
+            temp_path = f.name
+    except Exception:
+        # If writing fails, still return the markdown
+        temp_path = None
+
+    return sitemap_md, temp_path
+
+
 # ======================
-# UI:
+# UI: five-tab interface
 # ======================

 # --- Fetch tab (compact controllable extraction) ---
@@ -545,19 +629,51 @@ concise_interface = gr.Interface(
     submit_btn="Search",
 )

+# --- Generate Sitemap tab (LIMITED, grouped + optional per-domain cap) ---
+sitemap_interface = gr.Interface(
+    fn=Generate_Sitemap_Limited,
+    inputs=[
+        gr.Textbox(
+            label="Website URL",
+            placeholder="https://example.com or example.com"
+        ),
+        gr.Slider(
+            minimum=0,
+            maximum=1000,
+            value=0,
+            step=1,
+            label="Max links per domain (0 = show all)"
+        ),
+    ],
+    outputs=[
+        gr.Markdown(label="Sitemap (Markdown)"),
+        gr.File(label="Download .md"),
+    ],
+    title="Generate Sitemap",
+    description="Group links by Internal/External domains; optionally limit links per domain.",
+    api_description=(
+        "Scan a page and build a grouped sitemap of anchor links. Links are grouped as "
+        "Internal or External (per domain). Set a per-domain cap; 0 shows all."
+    ),
+    allow_flagging="never",
+    theme="Nymbo/Nymbo_Theme",
+    submit_btn="Generate",
+)
+
 # --- Combine all into a single app with tabs ---
 demo = gr.TabbedInterface(
-    interface_list=[fetch_interface, websearch_interface, unstructured_interface, concise_interface],
+    interface_list=[fetch_interface, websearch_interface, unstructured_interface, concise_interface, sitemap_interface],
     tab_names=[
         "Fetch Webpage",
         "DuckDuckGo Search (Structured)",
         "DuckDuckGo Search (Raw)",
         "DuckDuckGo Search (Concise)",
+        "Generate Sitemap",
     ],
-    title="Web MCP – Fetch
+    title="Web MCP – Fetch, Search, and Sitemaps with customizable output modes.",
     theme="Nymbo/Nymbo_Theme",
 )

 # Launch the UI and expose all functions as MCP tools in one server
 if __name__ == "__main__":
-    demo.launch(mcp_server=True)
+    demo.launch(mcp_server=True)
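
For reviewers who want to exercise the new tool before merging, a minimal local sketch follows. It assumes the updated module is importable as `app` (the file lives at `main/app.py`, so run from that directory or adjust the import), and `https://example.com` is only a placeholder target:

```python
# Quick smoke test for the new sitemap tool (sketch; the import path is an assumption).
from app import Generate_Sitemap_Limited

# Cap each domain group at 5 links; passing 0 would list every link.
markdown, md_path = Generate_Sitemap_Limited("https://example.com", max_links_per_domain=5)

print(markdown)  # "# Sitemap" header, then "## Internal Links" / "## External Links (...)" groups
print(md_path)   # path to the temporary .md download, or None if the temp file could not be written
```

Once the app is launched with `demo.launch(mcp_server=True)`, the same function is also reachable through the new "Generate Sitemap" tab and, per the launch comment, as an MCP tool alongside the existing four.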
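
Separately, the Concise tab's JSONL contract (short keys `"t"`, `"u"`, and optional `"s"`, as described in the `Search_Concise` docstring) is easy for callers to expand back into readable fields; a small illustrative helper, not part of this commit:

```python
import json
from typing import Dict, List

def parse_concise_jsonl(jsonl: str) -> List[Dict[str, str]]:
    """Expand Search_Concise output lines such as {"t": ..., "u": ..., "s": ...} into readable dicts."""
    results: List[Dict[str, str]] = []
    for line in jsonl.splitlines():
        if not line.strip():
            continue  # tolerate blank lines
        row = json.loads(line)
        results.append({
            "title": row.get("t", ""),
            "url": row.get("u", ""),
            "snippet": row.get("s", ""),  # present only when include_snippets=True
        })
    return results
```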