Nymbo committed on
Commit
645af7f
·
verified ·
1 Parent(s): 4fe48d6

Update app.py

Files changed (1)
  1. app.py +177 -61
app.py CHANGED
@@ -1,14 +1,16 @@
1
  # File: main/app.py
2
- # Purpose: One Space that offers four tools/tabs:
3
  # 1) Fetch — extract relevant page content (title, metadata, clean text, hyperlinks)
4
  # 2) Websearch — structured DuckDuckGo search via LangChain tool (JSON)
5
  # 3) Unstructured DDG — raw DuckDuckGo list[dict] rendered into a Textbox
6
  # 4) DDG (Concise) — ultra-succinct DuckDuckGo search that emits JSONL with short keys to minimize tokens
 
7
 
8
  from __future__ import annotations
9
 
10
  import re
11
  import json
 
12
  from typing import List, Dict, Literal, Tuple
13
 
14
  import gradio as gr
@@ -27,6 +29,7 @@ from duckduckgo_search import DDGS
27
  def _http_get(url: str) -> requests.Response:
28
  """
29
  Download the page politely with a short timeout and realistic headers.
 
30
  """
31
  headers = {
32
  "User-Agent": "Mozilla/5.0 (compatible; WebMCP/1.0; +https://example.com)",
@@ -39,6 +42,7 @@ def _http_get(url: str) -> requests.Response:
39
  def _normalize_whitespace(text: str) -> str:
40
  """
41
  Squeeze extra spaces and blank lines to keep things compact.
 
42
  """
43
  text = re.sub(r"[ \t\u00A0]+", " ", text)
44
  text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())
@@ -48,6 +52,7 @@ def _normalize_whitespace(text: str) -> str:
48
  def _truncate(text: str, max_chars: int) -> Tuple[str, bool]:
49
  """
50
  Cut text if it gets too long; return the text and whether we trimmed.
 
51
  """
52
  if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
53
  return text, False
@@ -57,6 +62,7 @@ def _truncate(text: str, max_chars: int) -> Tuple[str, bool]:
57
  def _shorten(text: str, limit: int) -> str:
58
  """
59
  Hard cap a string with an ellipsis to keep tokens small.
 
60
  """
61
  if limit <= 0 or len(text) <= limit:
62
  return text
@@ -66,6 +72,7 @@ def _shorten(text: str, limit: int) -> str:
66
  def _domain_of(url: str) -> str:
67
  """
68
  Show a friendly site name like "example.com".
 
69
  """
70
  try:
71
  return urlparse(url).netloc or ""
@@ -86,6 +93,7 @@ def _og(soup: BeautifulSoup, prop: str) -> str | None:
86
  def _extract_metadata(soup: BeautifulSoup, final_url: str) -> Dict[str, str]:
87
  """
88
  Pull the useful bits: title, description, site name, canonical URL, language, etc.
 
89
  """
90
  meta: Dict[str, str] = {}
91
 
@@ -125,6 +133,7 @@ def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
125
  """
126
  Use Readability to isolate the main article and turn it into clean text.
127
  Returns (clean_text, soup_of_readable_html).
 
128
  """
129
  # Simplified article HTML from Readability
130
  doc = Document(html)
@@ -152,6 +161,7 @@ def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
152
  def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> List[Tuple[str, str]]:
153
  """
154
  Collect clean, unique, absolute links from the readable section only.
 
155
  """
156
  seen = set()
157
  links: List[Tuple[str, str]] = []
@@ -194,6 +204,7 @@ def _format_markdown(
194
  ) -> str:
195
  """
196
  Assemble a compact Markdown summary with optional sections.
 
197
  """
198
  lines: List[str] = []
199
 
@@ -254,25 +265,7 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
254
  Fetch a web page and return a compact Markdown summary that includes title, key
255
  metadata, readable main text, and outbound links.
256
 
257
- Args:
258
- url (str): The HTTP/HTTPS URL to fetch. Must be publicly reachable.
259
- verbosity (str): Controls body length. One of: "Brief", "Standard", or "Full".
260
- - Brief ≈ up to 1,200 chars
260
- - Standard ≈ up to 3,000 chars
262
- - Full = no cap (still limited by `max_chars` if smaller)
263
- include_metadata (bool): If True, include a Metadata section with description,
264
- site name, canonical URL, language, and fetched URL.
265
- include_text (bool): If True, include the extracted readable body text.
266
- include_links (bool): If True, include a list of outbound links found in the
267
- readable section only (deduped and fragment-stripped).
268
- max_chars (int): Hard cap for body text length. Numeric value between 400 and
269
- 12000. The effective cap is the smaller of this value and the preset based
270
- on `verbosity`.
271
- max_links (int): Maximum number of links to include. Numeric value between 0 and 100.
272
-
273
- Returns:
274
- str: Markdown string containing the extracted summary. If the page cannot be
275
- fetched or parsed, a short error message is returned instead.
276
  """
277
  if not url or not url.strip():
278
  return "Please enter a valid URL."
@@ -336,17 +329,7 @@ def Search_Structured( # <-- MCP tool #2 (Structured DDG)
336
  ) -> List[Dict[Literal["snippet", "title", "link"], str]]:
337
  """
338
  Run a DuckDuckGo search and return structured results as a list of dictionaries.
339
-
340
- Args:
341
- input_query (str): The search query. Supports operators like site:, quotes,
342
- and boolean keywords.
343
- max_results (int): Number of results to return (1–20).
344
-
345
- Returns:
346
- List[Dict[Literal["snippet","title","link"], str]]: Each item contains:
347
- - snippet: Short text snippet
348
- - title: Result title
349
- - link: Result URL
350
  """
351
  if not input_query or not input_query.strip():
352
  return []
@@ -369,13 +352,7 @@ def Search_Raw( # <-- MCP tool #3 (Unstructured DDG)
369
  """
370
  Run a DuckDuckGo search using the native `duckduckgo_search` client and return the
371
  raw Python list of dictionaries from the library.
372
-
373
- Args:
374
- query (str): The search query string.
375
-
376
- Returns:
377
- list[dict]: The unmodified objects returned by `DDGS().text(...)`, typically
378
- containing keys like: title, href/link, body/snippet, source, etc.
379
  """
380
  if not query or not query.strip():
381
  return []
@@ -400,25 +377,8 @@ def Search_Concise( # <-- MCP tool #4 (Concise DDG)
400
  Run a DuckDuckGo search and return ultra-compact JSONL lines with short keys to
401
  minimize tokens.
402
 
403
- Args:
404
- query (str): The search query string.
405
- max_results (int): Maximum number of results to retrieve (1–20).
406
- include_snippets (bool): If True, include a shortened snippet per result under
407
- key "s".
408
- max_snippet_chars (int): Hard cap for snippet length when `include_snippets`
409
- is True. Range 20–200.
410
- dedupe_domains (bool): If True, only keep the first result per domain.
411
- title_chars (int): Hard cap for the title length. Range 20–120.
412
-
413
- Returns:
414
- str: Newline-delimited JSON (JSONL). Each line is a compact JSON object with
415
- short keys: "t" (title), "u" (URL), and optionally "s" (snippet).
416
-
417
- Example lines:
418
- {"t":"Example","u":"https://example.com/x"}
419
- {"t":"Another…","u":"https://a.com/y","s":"Short snippet…"}
420
  """
421
-
422
  if not query or not query.strip():
423
  return ""
424
 
@@ -426,7 +386,6 @@ def Search_Concise( # <-- MCP tool #4 (Concise DDG)
426
  with DDGS() as ddgs:
427
  raw = ddgs.text(query, max_results=max_results)
428
  except Exception as e:
429
-
430
  return json.dumps({"error": str(e)[:120]}, ensure_ascii=False, separators=(",", ":"))
431
 
432
  seen_domains = set()
@@ -458,8 +417,133 @@ def Search_Concise( # <-- MCP tool #4 (Concise DDG)
458
  return "\n".join(lines)
459
 
460
 
461
  # ======================
462
- # UI: four-tab interface
463
  # ======================
464
 
465
  # --- Fetch tab (compact controllable extraction) ---
@@ -545,19 +629,51 @@ concise_interface = gr.Interface(
545
  submit_btn="Search",
546
  )
547
 
548
  # --- Combine all into a single app with tabs ---
549
  demo = gr.TabbedInterface(
550
- interface_list=[fetch_interface, websearch_interface, unstructured_interface, concise_interface],
551
  tab_names=[
552
  "Fetch Webpage",
553
  "DuckDuckGo Search (Structured)",
554
  "DuckDuckGo Search (Raw)",
555
  "DuckDuckGo Search (Concise)",
 
556
  ],
557
- title="Web MCP β€” Fetch & DuckDuckGo search with customizable output modes.",
558
  theme="Nymbo/Nymbo_Theme",
559
  )
560
 
561
  # Launch the UI and expose all functions as MCP tools in one server
562
  if __name__ == "__main__":
563
- demo.launch(mcp_server=True)
 
1
  # File: main/app.py
2
+ # Purpose: One Space that offers five tools/tabs:
3
  # 1) Fetch — extract relevant page content (title, metadata, clean text, hyperlinks)
4
  # 2) Websearch — structured DuckDuckGo search via LangChain tool (JSON)
5
  # 3) Unstructured DDG — raw DuckDuckGo list[dict] rendered into a Textbox
6
  # 4) DDG (Concise) — ultra-succinct DuckDuckGo search that emits JSONL with short keys to minimize tokens
7
+ # 5) Generate Sitemap — LIMITED: grouped internal/external links with an optional per-domain cap (and a .md download)
8
 
9
  from __future__ import annotations
10
 
11
  import re
12
  import json
13
+ import tempfile # <-- used to create a downloadable .md for the sitemap
14
  from typing import List, Dict, Literal, Tuple
15
 
16
  import gradio as gr
 
29
  def _http_get(url: str) -> requests.Response:
30
  """
31
  Download the page politely with a short timeout and realistic headers.
32
+ (Layman's terms: grab the web page like a normal browser would, but quickly.)
33
  """
34
  headers = {
35
  "User-Agent": "Mozilla/5.0 (compatible; WebMCP/1.0; +https://example.com)",
 
42
  def _normalize_whitespace(text: str) -> str:
43
  """
44
  Squeeze extra spaces and blank lines to keep things compact.
45
+ (Layman's terms: tidy up the text so it's not full of weird spacing.)
46
  """
47
  text = re.sub(r"[ \t\u00A0]+", " ", text)
48
  text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())
 
52
  def _truncate(text: str, max_chars: int) -> Tuple[str, bool]:
53
  """
54
  Cut text if it gets too long; return the text and whether we trimmed.
55
+ (Layman's terms: shorten long text and tell us if we had to cut it.)
56
  """
57
  if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
58
  return text, False
 
62
  def _shorten(text: str, limit: int) -> str:
63
  """
64
  Hard cap a string with an ellipsis to keep tokens small.
65
+ (Layman's terms: force a string to a max length with an ellipsis.)
66
  """
67
  if limit <= 0 or len(text) <= limit:
68
  return text
 
72
  def _domain_of(url: str) -> str:
73
  """
74
  Show a friendly site name like "example.com".
75
+ (Layman's terms: pull the website's domain.)
76
  """
77
  try:
78
  return urlparse(url).netloc or ""
 
93
  def _extract_metadata(soup: BeautifulSoup, final_url: str) -> Dict[str, str]:
94
  """
95
  Pull the useful bits: title, description, site name, canonical URL, language, etc.
96
+ (Layman's terms: gather page basics like title/description/address.)
97
  """
98
  meta: Dict[str, str] = {}
99
 
 
133
  """
134
  Use Readability to isolate the main article and turn it into clean text.
135
  Returns (clean_text, soup_of_readable_html).
136
+ (Layman's terms: find the real article text and clean it.)
137
  """
138
  # Simplified article HTML from Readability
139
  doc = Document(html)
 
161
  def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> List[Tuple[str, str]]:
162
  """
163
  Collect clean, unique, absolute links from the readable section only.
164
+ (Layman's terms: pull a tidy list of links from the article body.)
165
  """
166
  seen = set()
167
  links: List[Tuple[str, str]] = []
 
204
  ) -> str:
205
  """
206
  Assemble a compact Markdown summary with optional sections.
207
+ (Layman's terms: build the final markdown output with options.)
208
  """
209
  lines: List[str] = []
210
 
 
265
  Fetch a web page and return a compact Markdown summary that includes title, key
266
  metadata, readable main text, and outbound links.
267
 
268
+ (Layman's terms: summarize a page with clean text + useful details.)
 
269
  """
270
  if not url or not url.strip():
271
  return "Please enter a valid URL."
 
329
  ) -> List[Dict[Literal["snippet", "title", "link"], str]]:
330
  """
331
  Run a DuckDuckGo search and return structured results as a list of dictionaries.
332
+ (Layman's terms: search DDG and get clean JSON objects.)
333
  """
334
  if not input_query or not input_query.strip():
335
  return []
 
352
  """
353
  Run a DuckDuckGo search using the native `duckduckgo_search` client and return the
354
  raw Python list of dictionaries from the library.
355
+ (Layman's terms: search DDG and show exactly what the library returns.)
356
  """
357
  if not query or not query.strip():
358
  return []
 
377
  Run a DuckDuckGo search and return ultra-compact JSONL lines with short keys to
378
  minimize tokens.
379
 
380
+ (Layman's terms: the tiniest useful search output possible.)
381
  """
 
382
  if not query or not query.strip():
383
  return ""
384
 
 
386
  with DDGS() as ddgs:
387
  raw = ddgs.text(query, max_results=max_results)
388
  except Exception as e:
 
389
  return json.dumps({"error": str(e)[:120]}, ensure_ascii=False, separators=(",", ":"))
390
 
391
  seen_domains = set()
 
417
  return "\n".join(lines)
418
 
419
 
420
+ # ============================================
421
+ # Generate Sitemap: LIMITED (new MCP tool #5)
422
+ # ============================================
423
+
424
+ def Generate_Sitemap_Limited(
425
+ url: str,
426
+ max_links_per_domain: int = 0,
427
+ ) -> Tuple[str, str | None]:
428
+ """
429
+ Generate a grouped sitemap from all anchor links on a page, with an optional
430
+ per-domain cap. Returns (markdown, downloadable_file_path).
431
+
432
+ (Layman's terms: list all links on a page, grouped by Internal/External domain.
433
+ You can limit how many per domain; 0 means show all.)
434
+ """
435
+ # --- Basic validation & normalization ---
436
+ if not url or not url.strip():
437
+ return "Please enter a valid URL.", None
438
+
439
+ # If the user forgot the scheme, assume https
440
+ if not url.lower().startswith(("http://", "https://")):
441
+ url = "https://" + url.strip()
442
+
443
+ # --- Fetch the page safely ---
444
+ try:
445
+ resp = _http_get(url)
446
+ resp.raise_for_status()
447
+ except requests.exceptions.RequestException as e:
448
+ return f"Error fetching URL: {str(e)}", None
449
+
450
+ base_url = str(resp.url) # follow redirects and use the final URL
451
+ content_type = resp.headers.get("Content-Type", "")
452
+ if "html" not in content_type.lower():
453
+ return "The provided URL does not appear to be an HTML page.", None
454
+
455
+ # --- Parse and collect links ---
456
+ soup = BeautifulSoup(resp.content, "lxml") # fast, lenient HTML parsing
457
+ anchors = soup.find_all("a", href=True)
458
+
459
+ seen_urls: set[str] = set()
460
+ items: List[Dict[str, str]] = []
461
+
462
+ for a in anchors:
463
+ href = (a.get("href") or "").strip()
464
+ if not href:
465
+ continue
466
+
467
+ # Skip non-navigational/unsupported schemes
468
+ if href.startswith(("#", "javascript:", "mailto:", "tel:")):
469
+ continue
470
+
471
+ # Resolve relative links and strip fragments
472
+ absolute = urljoin(base_url, href)
473
+ absolute, _ = urldefrag(absolute)
474
+
475
+ # Deduplicate and skip self
476
+ if absolute in seen_urls or absolute == base_url:
477
+ continue
478
+ seen_urls.add(absolute)
479
+
480
+ # Use link text if available; otherwise the URL itself
481
+ text = (a.get_text(" ", strip=True) or href).strip()
482
+ if len(text) > 100:
483
+ text = text[:100] + "..."
484
+
485
+ items.append({"text": text, "url": absolute})
486
+
487
+ if not items:
488
+ return "No links found on this page.", None
489
+
490
+ # --- Group by Internal vs External domains ---
491
+ base_netloc = urlparse(base_url).netloc
492
+ domain_groups: Dict[str, List[Dict[str, str]]] = {}
493
+
494
+ for it in items:
495
+ netloc = urlparse(it["url"]).netloc
496
+ key = "Internal Links" if netloc == base_netloc else f"External Links ({netloc})"
497
+ domain_groups.setdefault(key, []).append(it)
498
+
499
+ # --- Build Markdown with optional per-domain limit ---
500
+ total_links = len(items)
501
+ md_lines: List[str] = []
502
+ md_lines.append("# Sitemap")
503
+ md_lines.append(f"Base URL: {base_url}")
504
+ md_lines.append(f"Found {total_links} links:\n")
505
+
506
+ # Show Internal first, then external groups sorted by name
507
+ keys_sorted = ["Internal Links"] + sorted([k for k in domain_groups if k != "Internal Links"])
508
+
509
+ for group_key in keys_sorted:
510
+ if group_key not in domain_groups:
511
+ continue
512
+
513
+ group_links = domain_groups[group_key]
514
+ md_lines.append(f"## {group_key}\n")
515
+
516
+ if max_links_per_domain and max_links_per_domain > 0:
517
+ links_to_show = group_links[:max_links_per_domain]
518
+ remaining = max(0, len(group_links) - max_links_per_domain)
519
+ else:
520
+ links_to_show = group_links
521
+ remaining = 0
522
+
523
+ for link in links_to_show:
524
+ md_lines.append(f"- [{link['text']}]({link['url']})")
525
+
526
+ if remaining > 0:
527
+ md_lines.append(f"- ... and {remaining} more links")
528
+
529
+ md_lines.append("") # blank line after each group
530
+
531
+ sitemap_md = "\n".join(md_lines).strip()
532
+
533
+ # --- Save to a temp .md so the UI can offer a download ---
534
+ try:
535
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".md", mode="w", encoding="utf-8") as f:
536
+ f.write(sitemap_md)
537
+ temp_path = f.name
538
+ except Exception:
539
+ # If writing fails, still return the markdown
540
+ temp_path = None
541
+
542
+ return sitemap_md, temp_path
543
+
544
+
545
  # ======================
546
+ # UI: five-tab interface
547
  # ======================
548
 
549
  # --- Fetch tab (compact controllable extraction) ---
 
629
  submit_btn="Search",
630
  )
631
 
632
+ # --- Generate Sitemap tab (LIMITED, grouped + optional per-domain cap) ---
633
+ sitemap_interface = gr.Interface(
634
+ fn=Generate_Sitemap_Limited,
635
+ inputs=[
636
+ gr.Textbox(
637
+ label="Website URL",
638
+ placeholder="https://example.com or example.com"
639
+ ),
640
+ gr.Slider(
641
+ minimum=0,
642
+ maximum=1000,
643
+ value=0,
644
+ step=1,
645
+ label="Max links per domain (0 = show all)"
646
+ ),
647
+ ],
648
+ outputs=[
649
+ gr.Markdown(label="Sitemap (Markdown)"),
650
+ gr.File(label="Download .md"),
651
+ ],
652
+ title="Generate Sitemap",
653
+ description="Group links by Internal/External domains; optionally limit links per domain.",
654
+ api_description=(
655
+ "Scan a page and build a grouped sitemap of anchor links. Links are grouped as "
656
+ "Internal or External (per domain). Set a per-domain cap; 0 shows all."
657
+ ),
658
+ allow_flagging="never",
659
+ theme="Nymbo/Nymbo_Theme",
660
+ submit_btn="Generate",
661
+ )
662
+
663
  # --- Combine all into a single app with tabs ---
664
  demo = gr.TabbedInterface(
665
+ interface_list=[fetch_interface, websearch_interface, unstructured_interface, concise_interface, sitemap_interface],
666
  tab_names=[
667
  "Fetch Webpage",
668
  "DuckDuckGo Search (Structured)",
669
  "DuckDuckGo Search (Raw)",
670
  "DuckDuckGo Search (Concise)",
671
+ "Generate Sitemap",
672
  ],
673
+ title="Web MCP β€” Fetch, Search, and Sitemaps with customizable output modes.",
674
  theme="Nymbo/Nymbo_Theme",
675
  )
676
 
677
  # Launch the UI and expose all functions as MCP tools in one server
678
  if __name__ == "__main__":
679
+ demo.launch(mcp_server=True)
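
A quick way to sanity-check the new sitemap tool outside the Gradio UI is to call the function directly. The sketch below is not part of the commit; it assumes `app.py` and its dependencies are importable from the working directory, and the URL and per-domain cap are placeholder values.

```python
# Hypothetical local check of the sitemap tool added in this commit.
# Assumes app.py and its dependencies (gradio, requests, beautifulsoup4, lxml, etc.)
# are installed; the URL and the cap of 5 are placeholders.
from app import Generate_Sitemap_Limited

sitemap_md, md_path = Generate_Sitemap_Limited(
    "https://example.com",   # page whose anchor links get grouped
    max_links_per_domain=5,  # 0 means "show all links per domain"
)

print(sitemap_md)                     # Markdown grouped into Internal/External links
print("Downloadable copy:", md_path)  # temp .md path, or None if writing the file failed
```

Since the Gradio tab and the MCP tool both call `Generate_Sitemap_Limited` directly, this should produce the same Markdown the UI renders.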