Nymbo committed on
Commit
21f0d84
·
verified ·
1 Parent(s): 0a0a050

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -17
app.py CHANGED
@@ -1,25 +1,27 @@
1
  # File: main/app.py
2
- # Purpose: One Space that offers three tools/tabs:
3
  # 1) Fetch — extract relevant page content (title, metadata, clean text, hyperlinks)
4
  # 2) Websearch — structured DuckDuckGo search via LangChain tool (JSON)
5
- # 3) Unstructured DDG — raw DuckDuckGo list[dict] rendered into a Textbox (matches your app)
 
6
  #
7
  # Launched with mcp_server=True so all functions are available as MCP tools.
8
 
9
  from __future__ import annotations
10
 
11
- import re # (layman) used to tidy up whitespace
 
12
  from typing import List, Dict, Literal, Tuple
13
 
14
- import gradio as gr # (layman) the UI framework
15
- import requests # (layman) to download web pages
16
- from bs4 import BeautifulSoup # (layman) for parsing HTML
17
- from readability import Document # (layman) to isolate main readable content
18
- from urllib.parse import urljoin, urldefrag, urlparse # (layman) to fix/clean URLs
19
 
20
  # Structured search via LangChain community tool
21
  from langchain_community.tools import DuckDuckGoSearchResults
22
- # Unstructured search using the native DDG client (matches your separate space)
23
  from duckduckgo_search import DDGS
24
 
25
 
@@ -57,6 +59,15 @@ def _truncate(text: str, max_chars: int) -> Tuple[str, bool]:
57
  return text[:max_chars].rstrip() + " …", True
58
 
59
 
 
 
 
 
 
 
 
 
 
60
  def _domain_of(url: str) -> str:
61
  """
62
  (layman) Show a friendly site name like "example.com".
@@ -338,9 +349,75 @@ def ddg_unstructured( # <-- MCP tool #3 (Unstructured DDG)
338
  return results
339
 
340
 
341
- # =====================
342
- # UI: three-tab interface
343
- # =====================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
 
345
  # --- Fetch tab (compact controllable extraction) ---
346
  fetch_interface = gr.Interface(
@@ -381,17 +458,36 @@ unstructured_interface = gr.Interface(
381
  inputs=gr.Textbox(label="Enter Search Query"), # (layman) same single input
382
  outputs=gr.Textbox(label="Results", interactive=False), # (layman) Textbox showing str(list[dict])
383
  title="Unstructured DDG (Raw List)", # (layman) clear label
384
- description="Returns the raw list of results (list[dict]) shown as text.", # (layman) behavior note
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
385
  allow_flagging="never",
386
  theme="Nymbo/Nymbo_Theme",
387
- submit_btn="Search", # (layman) match your original button label
388
  )
389
 
390
  # --- Combine all into a single app with tabs ---
391
  demo = gr.TabbedInterface(
392
- interface_list=[fetch_interface, websearch_interface, unstructured_interface],
393
- tab_names=["Fetch", "Websearch", "Unstructured DDG"],
394
- title="Web MCP — Fetch + Websearch + Unstructured DDG",
395
  theme="Nymbo/Nymbo_Theme",
396
  )
397
 
 
1
  # File: main/app.py
2
+ # Purpose: One Space that offers four tools/tabs:
3
  # 1) Fetch — extract relevant page content (title, metadata, clean text, hyperlinks)
4
  # 2) Websearch — structured DuckDuckGo search via LangChain tool (JSON)
5
+ # 3) Unstructured DDG — raw DuckDuckGo list[dict] rendered into a Textbox
6
+ # 4) DDG (Concise) — ultra-succinct DuckDuckGo search that emits JSONL with short keys to minimize tokens
7
  #
8
  # Launched with mcp_server=True so all functions are available as MCP tools.
9
 
10
  from __future__ import annotations
11
 
12
+ import re # (layman) whitespace cleanup and trimming
13
+ import json # (layman) to produce compact JSON lines
14
  from typing import List, Dict, Literal, Tuple
15
 
16
+ import gradio as gr # (layman) UI framework
17
+ import requests # (layman) to download web pages
18
+ from bs4 import BeautifulSoup # (layman) HTML parsing
19
+ from readability import Document # (layman) isolate main article content
20
+ from urllib.parse import urljoin, urldefrag, urlparse # (layman) URL helpers
21
 
22
  # Structured search via LangChain community tool
23
  from langchain_community.tools import DuckDuckGoSearchResults
24
+ # Native DDG client (used by Unstructured + Concise tabs)
25
  from duckduckgo_search import DDGS
26
 
27
 
 
59
  return text[:max_chars].rstrip() + " …", True
60
 
61
 
62
+ def _shorten(text: str, limit: int) -> str:
63
+ """
64
+ (layman) Hard cap a string with an ellipsis to keep tokens small.
65
+ """
66
+ if limit <= 0 or len(text) <= limit:
67
+ return text
68
+ return text[: max(0, limit - 1)].rstrip() + "…"
69
+
70
+
71
  def _domain_of(url: str) -> str:
72
  """
73
  (layman) Show a friendly site name like "example.com".
 
349
  return results
350
 
351
 
352
+ # ============================================
353
+ # Concise DDG: ultra-succinct JSONL for tokens
354
+ # ============================================
355
+
356
def ddg_concise(  # <-- MCP tool #4 (Concise DDG)
    query: str,
    max_results: int = 5,
    include_snippets: bool = False,
    max_snippet_chars: int = 80,
    dedupe_domains: bool = True,
    title_chars: int = 80,
) -> str:
    """
    (layman) Minimal-output DuckDuckGo search designed to reduce tokens:
    - Returns newline-delimited JSON (JSONL) with short keys:
        t=title, u=url, s=snippet (optional)
    - Titles and snippets are hard-capped to save tokens.
    - Domain dedupe on by default to avoid near-duplicates.

    Args:
        query: Search text; an empty or whitespace-only query returns "".
        max_results: Number of results requested from DuckDuckGo. Dedupe is
            applied afterwards, so fewer lines may be emitted.
        include_snippets: When True, add the capped snippet under key "s".
        max_snippet_chars: Character cap applied to each snippet.
        dedupe_domains: When True, keep only the first result per domain.
        title_chars: Character cap applied to each title.

    Returns:
        A compact string like:
        {"t":"Example","u":"https://example.com/x"}
        {"t":"Another…","u":"https://a.com/y","s":"Short snippet…"}
        If the search itself fails, a single JSON object {"error": "..."}
        with the message cut to 120 characters is returned instead.
    """
    # Fast guard for empty input
    if not query or not query.strip():
        return ""

    # (layman) UI sliders / JSON numbers can arrive as floats (5.0); the search
    # client and string slicing need real ints, so normalize once up front.
    max_results = int(max_results)
    max_snippet_chars = int(max_snippet_chars)
    title_chars = int(title_chars)

    # Perform the search
    try:
        with DDGS() as ddgs:
            raw = ddgs.text(query, max_results=max_results)
    except Exception as e:
        # (layman) Minimal error payload to keep tokens low; this is the tool
        # boundary, so any client failure is reported rather than raised.
        return json.dumps({"error": str(e)[:120]}, ensure_ascii=False, separators=(",", ":"))

    # Optionally dedupe by domain to reduce redundant lines
    seen_domains = set()
    lines: List[str] = []

    for r in raw or []:
        # Result dicts may use either href/link and body/snippet key names.
        title = _shorten((r.get("title") or "").strip(), title_chars)
        url = (r.get("href") or r.get("link") or "").strip()
        body = (r.get("body") or r.get("snippet") or "").strip()

        # A result without a URL is useless to the caller; drop it.
        if not url:
            continue

        if dedupe_domains:
            dom = _domain_of(url)
            if dom in seen_domains:
                continue
            seen_domains.add(dom)

        # Fall back to the domain when the result has no title.
        obj = {"t": title or _domain_of(url), "u": url}

        if include_snippets and body:
            obj["s"] = _shorten(body, max_snippet_chars)

        # Emit most compact JSON possible (no spaces)
        lines.append(json.dumps(obj, ensure_ascii=False, separators=(",", ":")))

    # Join as JSONL (each result on its own line)
    return "\n".join(lines)
416
+
417
+
418
+ # ======================
419
+ # UI: four-tab interface
420
+ # ======================
421
 
422
  # --- Fetch tab (compact controllable extraction) ---
423
  fetch_interface = gr.Interface(
 
458
  inputs=gr.Textbox(label="Enter Search Query"), # (layman) same single input
459
  outputs=gr.Textbox(label="Results", interactive=False), # (layman) Textbox showing str(list[dict])
460
  title="Unstructured DDG (Raw List)", # (layman) clear label
461
+ description="Returns the raw list of results (list[dict]) shown as text.",
462
+ allow_flagging="never",
463
+ theme="Nymbo/Nymbo_Theme",
464
+ submit_btn="Search",
465
+ )
466
+
467
+ # --- Concise DDG tab (JSONL with short keys, minimal tokens) ---
468
# Gradio tab for the concise search tool. The input widgets are listed in the
# same order as ddg_concise's parameters because Gradio passes them positionally.
concise_interface = gr.Interface(
    fn=ddg_concise,  # (layman) the ultra-succinct search tool
    inputs=[
        # query
        gr.Textbox(label="Query", placeholder="topic OR site:example.com"),
        # max_results
        gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results"),
        # include_snippets
        gr.Checkbox(value=False, label="Include snippets (adds tokens)"),
        # max_snippet_chars
        gr.Slider(minimum=20, maximum=200, value=80, step=5, label="Max snippet chars"),
        # dedupe_domains
        gr.Checkbox(value=True, label="Dedupe by domain"),
        # title_chars
        gr.Slider(minimum=20, maximum=120, value=80, step=5, label="Max title chars"),
    ],
    # (layman) read-only box so the JSONL is easy to copy
    outputs=gr.Textbox(label="Results (JSONL)", interactive=False),
    title="DDG (Concise) — Minimal Tokens",
    description="Emits JSONL with short keys (t,u[,s]). Defaults avoid snippets and duplicate domains.",
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme",
    submit_btn="Search",
)
485
 
486
  # --- Combine all into a single app with tabs ---
487
# One app, four tabs; tab_names must stay in the same order as interface_list.
demo = gr.TabbedInterface(
    interface_list=[
        fetch_interface,
        websearch_interface,
        unstructured_interface,
        concise_interface,
    ],
    tab_names=[
        "Fetch",
        "Websearch",
        "Unstructured DDG",
        "DDG (Concise)",
    ],
    title="Web MCP — Fetch + Websearch + Unstructured + Concise",
    theme="Nymbo/Nymbo_Theme",
)
493