Nymbo committed on
Commit
582b9a7
·
verified ·
1 Parent(s): bea2597

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -29
app.py CHANGED
@@ -10,7 +10,6 @@ from __future__ import annotations
10
 
11
  import re
12
  import json
13
- import tempfile # <-- used to create a downloadable .md for the sitemap
14
  from typing import List, Dict, Literal, Tuple
15
 
16
  import gradio as gr
@@ -418,23 +417,31 @@ def Search_Concise( # <-- MCP tool #4 (Concise DDG)
418
 
419
 
420
  # ============================================
421
- # Generate Sitemap: LIMITED (new MCP tool #5)
422
  # ============================================
423
 
424
- def Generate_Sitemap_Limited(
425
  url: str,
426
  max_links_per_domain: int = 0,
427
- ) -> Tuple[str, str | None]:
428
  """
429
- Generate a grouped sitemap from all anchor links on a page, with an optional
430
- per-domain cap. Returns (markdown, downloadable_file_path).
431
-
432
- (Layman's terms: list all links on a page, grouped by Internal/External domain.
433
- You can limit how many per domain; 0 means show all.)
 
 
 
 
 
 
 
 
434
  """
435
  # --- Basic validation & normalization ---
436
  if not url or not url.strip():
437
- return "Please enter a valid URL.", None
438
 
439
  # If the user forgot the scheme, assume https
440
  if not url.lower().startswith(("http://", "https://")):
@@ -445,12 +452,12 @@ def Generate_Sitemap_Limited(
445
  resp = _http_get(url)
446
  resp.raise_for_status()
447
  except requests.exceptions.RequestException as e:
448
- return f"Error fetching URL: {str(e)}", None
449
 
450
  base_url = str(resp.url) # follow redirects and use the final URL
451
  content_type = resp.headers.get("Content-Type", "")
452
  if "html" not in content_type.lower():
453
- return "The provided URL does not appear to be an HTML page.", None
454
 
455
  # --- Parse and collect links ---
456
  soup = BeautifulSoup(resp.content, "lxml") # fast, lenient HTML parsing
@@ -485,7 +492,7 @@ def Generate_Sitemap_Limited(
485
  items.append({"text": text, "url": absolute})
486
 
487
  if not items:
488
- return "No links found on this page.", None
489
 
490
  # --- Group by Internal vs External domains ---
491
  base_netloc = urlparse(base_url).netloc
@@ -529,17 +536,7 @@ def Generate_Sitemap_Limited(
529
  md_lines.append("") # blank line after each group
530
 
531
  sitemap_md = "\n".join(md_lines).strip()
532
-
533
- # --- Save to a temp .md so the UI can offer a download ---
534
- try:
535
- with tempfile.NamedTemporaryFile(delete=False, suffix=".md", mode="w", encoding="utf-8") as f:
536
- f.write(sitemap_md)
537
- temp_path = f.name
538
- except Exception:
539
- # If writing fails, still return the markdown
540
- temp_path = None
541
-
542
- return sitemap_md, temp_path
543
 
544
 
545
  # ======================
@@ -631,7 +628,7 @@ concise_interface = gr.Interface(
631
 
632
  # --- Generate Sitemap tab (LIMITED, grouped + optional per-domain cap) ---
633
  sitemap_interface = gr.Interface(
634
- fn=Generate_Sitemap_Limited,
635
  inputs=[
636
  gr.Textbox(
637
  label="Website URL",
@@ -645,10 +642,7 @@ sitemap_interface = gr.Interface(
645
  label="Max links per domain (0 = show all)"
646
  ),
647
  ],
648
- outputs=[
649
- gr.Markdown(label="Sitemap (Markdown)"),
650
- gr.File(label="Download .md"),
651
- ],
652
  title="Generate Sitemap",
653
  description="Group links by Internal/External domains; optionally limit links per domain.",
654
  api_description=(
 
10
 
11
  import re
12
  import json
 
13
  from typing import List, Dict, Literal, Tuple
14
 
15
  import gradio as gr
 
417
 
418
 
419
  # ============================================
420
+ # Generate Sitemap (new MCP tool #5)
421
  # ============================================
422
 
423
+ def Generate_Sitemap(
424
  url: str,
425
  max_links_per_domain: int = 0,
426
+ ) -> str:
427
  """
428
+ Generate a grouped sitemap (Markdown) of anchor links on a page, with an optional
429
+ per-domain cap.
430
+
431
+ Args:
432
+ url (str): The starting page URL (http/https). If the scheme is omitted,
433
+ https is assumed.
434
+ max_links_per_domain (int): Limit the number of links shown per domain.
435
+ Use 0 to show all links.
436
+
437
+ Returns:
438
+ str: Markdown text containing grouped links under "Internal Links" and
439
+ per-domain "External Links (domain)" sections. If an error occurs or no
440
+ links are found, a short message is returned.
441
  """
442
  # --- Basic validation & normalization ---
443
  if not url or not url.strip():
444
+ return "Please enter a valid URL."
445
 
446
  # If the user forgot the scheme, assume https
447
  if not url.lower().startswith(("http://", "https://")):
 
452
  resp = _http_get(url)
453
  resp.raise_for_status()
454
  except requests.exceptions.RequestException as e:
455
+ return f"Error fetching URL: {str(e)}"
456
 
457
  base_url = str(resp.url) # follow redirects and use the final URL
458
  content_type = resp.headers.get("Content-Type", "")
459
  if "html" not in content_type.lower():
460
+ return "The provided URL does not appear to be an HTML page."
461
 
462
  # --- Parse and collect links ---
463
  soup = BeautifulSoup(resp.content, "lxml") # fast, lenient HTML parsing
 
492
  items.append({"text": text, "url": absolute})
493
 
494
  if not items:
495
+ return "No links found on this page."
496
 
497
  # --- Group by Internal vs External domains ---
498
  base_netloc = urlparse(base_url).netloc
 
536
  md_lines.append("") # blank line after each group
537
 
538
  sitemap_md = "\n".join(md_lines).strip()
539
+ return sitemap_md
 
 
 
 
 
 
 
 
 
 
540
 
541
 
542
  # ======================
 
628
 
629
  # --- Generate Sitemap tab (LIMITED, grouped + optional per-domain cap) ---
630
  sitemap_interface = gr.Interface(
631
+ fn=Generate_Sitemap,
632
  inputs=[
633
  gr.Textbox(
634
  label="Website URL",
 
642
  label="Max links per domain (0 = show all)"
643
  ),
644
  ],
645
+ outputs=gr.Markdown(label="Sitemap (Markdown)"),
 
 
 
646
  title="Generate Sitemap",
647
  description="Group links by Internal/External domains; optionally limit links per domain.",
648
  api_description=(