Update app.py
Browse files
app.py
CHANGED
@@ -10,7 +10,6 @@ from __future__ import annotations
|
|
10 |
|
11 |
import re
|
12 |
import json
|
13 |
-
import tempfile # <-- used to create a downloadable .md for the sitemap
|
14 |
from typing import List, Dict, Literal, Tuple
|
15 |
|
16 |
import gradio as gr
|
@@ -418,23 +417,31 @@ def Search_Concise( # <-- MCP tool #4 (Concise DDG)
|
|
418 |
|
419 |
|
420 |
# ============================================
|
421 |
-
# Generate Sitemap
|
422 |
# ============================================
|
423 |
|
424 |
-
def Generate_Sitemap_Limited(
|
425 |
url: str,
|
426 |
max_links_per_domain: int = 0,
|
427 |
-
) ->
|
428 |
"""
|
429 |
-
Generate a grouped sitemap
|
430 |
-
per-domain cap.
|
431 |
-
|
432 |
-
|
433 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
434 |
"""
|
435 |
# --- Basic validation & normalization ---
|
436 |
if not url or not url.strip():
|
437 |
-
return "Please enter a valid URL."
|
438 |
|
439 |
# If the user forgot the scheme, assume https
|
440 |
if not url.lower().startswith(("http://", "https://")):
|
@@ -445,12 +452,12 @@ def Generate_Sitemap_Limited(
|
|
445 |
resp = _http_get(url)
|
446 |
resp.raise_for_status()
|
447 |
except requests.exceptions.RequestException as e:
|
448 |
-
return f"Error fetching URL: {str(e)}"
|
449 |
|
450 |
base_url = str(resp.url) # follow redirects and use the final URL
|
451 |
content_type = resp.headers.get("Content-Type", "")
|
452 |
if "html" not in content_type.lower():
|
453 |
-
return "The provided URL does not appear to be an HTML page."
|
454 |
|
455 |
# --- Parse and collect links ---
|
456 |
soup = BeautifulSoup(resp.content, "lxml") # fast, lenient HTML parsing
|
@@ -485,7 +492,7 @@ def Generate_Sitemap_Limited(
|
|
485 |
items.append({"text": text, "url": absolute})
|
486 |
|
487 |
if not items:
|
488 |
-
return "No links found on this page."
|
489 |
|
490 |
# --- Group by Internal vs External domains ---
|
491 |
base_netloc = urlparse(base_url).netloc
|
@@ -529,17 +536,7 @@ def Generate_Sitemap_Limited(
|
|
529 |
md_lines.append("") # blank line after each group
|
530 |
|
531 |
sitemap_md = "\n".join(md_lines).strip()
|
532 |
-
|
533 |
-
# --- Save to a temp .md so the UI can offer a download ---
|
534 |
-
try:
|
535 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".md", mode="w", encoding="utf-8") as f:
|
536 |
-
f.write(sitemap_md)
|
537 |
-
temp_path = f.name
|
538 |
-
except Exception:
|
539 |
-
# If writing fails, still return the markdown
|
540 |
-
temp_path = None
|
541 |
-
|
542 |
-
return sitemap_md, temp_path
|
543 |
|
544 |
|
545 |
# ======================
|
@@ -631,7 +628,7 @@ concise_interface = gr.Interface(
|
|
631 |
|
632 |
# --- Generate Sitemap tab (LIMITED, grouped + optional per-domain cap) ---
|
633 |
sitemap_interface = gr.Interface(
|
634 |
-
fn=Generate_Sitemap_Limited,
|
635 |
inputs=[
|
636 |
gr.Textbox(
|
637 |
label="Website URL",
|
@@ -645,10 +642,7 @@ sitemap_interface = gr.Interface(
|
|
645 |
label="Max links per domain (0 = show all)"
|
646 |
),
|
647 |
],
|
648 |
-
outputs=[
|
649 |
-
gr.Markdown(label="Sitemap (Markdown)"),
|
650 |
-
gr.File(label="Download .md"),
|
651 |
-
],
|
652 |
title="Generate Sitemap",
|
653 |
description="Group links by Internal/External domains; optionally limit links per domain.",
|
654 |
api_description=(
|
|
|
10 |
|
11 |
import re
|
12 |
import json
|
|
|
13 |
from typing import List, Dict, Literal, Tuple
|
14 |
|
15 |
import gradio as gr
|
|
|
417 |
|
418 |
|
419 |
# ============================================
|
420 |
+
# Generate Sitemap (new MCP tool #5)
|
421 |
# ============================================
|
422 |
|
423 |
+
def Generate_Sitemap(
|
424 |
url: str,
|
425 |
max_links_per_domain: int = 0,
|
426 |
+
) -> str:
|
427 |
"""
|
428 |
+
Generate a grouped sitemap (Markdown) of anchor links on a page, with an optional
|
429 |
+
per-domain cap.
|
430 |
+
|
431 |
+
Args:
|
432 |
+
url (str): The starting page URL (http/https). If the scheme is omitted,
|
433 |
+
https is assumed.
|
434 |
+
max_links_per_domain (int): Limit the number of links shown per domain.
|
435 |
+
Use 0 to show all links.
|
436 |
+
|
437 |
+
Returns:
|
438 |
+
str: Markdown text containing grouped links under "Internal Links" and
|
439 |
+
per-domain "External Links (domain)" sections. If an error occurs or no
|
440 |
+
links are found, a short message is returned.
|
441 |
"""
|
442 |
# --- Basic validation & normalization ---
|
443 |
if not url or not url.strip():
|
444 |
+
return "Please enter a valid URL."
|
445 |
|
446 |
# If the user forgot the scheme, assume https
|
447 |
if not url.lower().startswith(("http://", "https://")):
|
|
|
452 |
resp = _http_get(url)
|
453 |
resp.raise_for_status()
|
454 |
except requests.exceptions.RequestException as e:
|
455 |
+
return f"Error fetching URL: {str(e)}"
|
456 |
|
457 |
base_url = str(resp.url) # follow redirects and use the final URL
|
458 |
content_type = resp.headers.get("Content-Type", "")
|
459 |
if "html" not in content_type.lower():
|
460 |
+
return "The provided URL does not appear to be an HTML page."
|
461 |
|
462 |
# --- Parse and collect links ---
|
463 |
soup = BeautifulSoup(resp.content, "lxml") # fast, lenient HTML parsing
|
|
|
492 |
items.append({"text": text, "url": absolute})
|
493 |
|
494 |
if not items:
|
495 |
+
return "No links found on this page."
|
496 |
|
497 |
# --- Group by Internal vs External domains ---
|
498 |
base_netloc = urlparse(base_url).netloc
|
|
|
536 |
md_lines.append("") # blank line after each group
|
537 |
|
538 |
sitemap_md = "\n".join(md_lines).strip()
|
539 |
+
return sitemap_md
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
540 |
|
541 |
|
542 |
# ======================
|
|
|
628 |
|
629 |
# --- Generate Sitemap tab (LIMITED, grouped + optional per-domain cap) ---
|
630 |
sitemap_interface = gr.Interface(
|
631 |
+
fn=Generate_Sitemap,
|
632 |
inputs=[
|
633 |
gr.Textbox(
|
634 |
label="Website URL",
|
|
|
642 |
label="Max links per domain (0 = show all)"
|
643 |
),
|
644 |
],
|
645 |
+
outputs=gr.Markdown(label="Sitemap (Markdown)"),
|
|
|
|
|
|
|
646 |
title="Generate Sitemap",
|
647 |
description="Group links by Internal/External domains; optionally limit links per domain.",
|
648 |
api_description=(
|