Update app.py
Browse files
app.py
CHANGED
@@ -1,25 +1,27 @@
|
|
1 |
# File: main/app.py
|
2 |
-
# Purpose: One Space that offers three tools/tabs:
|
3 |
# 1) Fetch — extract relevant page content (title, metadata, clean text, hyperlinks)
|
4 |
# 2) Websearch — structured DuckDuckGo search via LangChain tool (JSON)
|
5 |
-
# 3) Unstructured DDG — raw DuckDuckGo list[dict] rendered into a Textbox
|
|
|
6 |
#
|
7 |
# Launched with mcp_server=True so all functions are available as MCP tools.
|
8 |
|
9 |
from __future__ import annotations
|
10 |
|
11 |
-
import re
|
|
|
12 |
from typing import List, Dict, Literal, Tuple
|
13 |
|
14 |
-
import gradio as gr
|
15 |
-
import requests
|
16 |
-
from bs4 import BeautifulSoup
|
17 |
-
from readability import Document
|
18 |
-
from urllib.parse import urljoin, urldefrag, urlparse # (layman)
|
19 |
|
20 |
# Structured search via LangChain community tool
|
21 |
from langchain_community.tools import DuckDuckGoSearchResults
|
22 |
-
#
|
23 |
from duckduckgo_search import DDGS
|
24 |
|
25 |
|
@@ -57,6 +59,15 @@ def _truncate(text: str, max_chars: int) -> Tuple[str, bool]:
|
|
57 |
return text[:max_chars].rstrip() + " …", True
|
58 |
|
59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
def _domain_of(url: str) -> str:
|
61 |
"""
|
62 |
(layman) Show a friendly site name like "example.com".
|
@@ -338,9 +349,75 @@ def ddg_unstructured( # <-- MCP tool #3 (Unstructured DDG)
|
|
338 |
return results
|
339 |
|
340 |
|
341 |
-
#
|
342 |
-
#
|
343 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
344 |
|
345 |
# --- Fetch tab (compact controllable extraction) ---
|
346 |
fetch_interface = gr.Interface(
|
@@ -381,17 +458,36 @@ unstructured_interface = gr.Interface(
|
|
381 |
inputs=gr.Textbox(label="Enter Search Query"), # (layman) same single input
|
382 |
outputs=gr.Textbox(label="Results", interactive=False), # (layman) Textbox showing str(list[dict])
|
383 |
title="Unstructured DDG (Raw List)", # (layman) clear label
|
384 |
-
description="Returns the raw list of results (list[dict]) shown as text.",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
385 |
allow_flagging="never",
|
386 |
theme="Nymbo/Nymbo_Theme",
|
387 |
-
submit_btn="Search",
|
388 |
)
|
389 |
|
390 |
# --- Combine all into a single app with tabs ---
|
391 |
demo = gr.TabbedInterface(
    # (layman) One tab per tool; tab_names must line up one-to-one with interface_list.
    interface_list=[fetch_interface, websearch_interface, unstructured_interface],
    tab_names=["Fetch", "Websearch", "Unstructured DDG"],
    # (layman) The title string was left unterminated; it is closed here so the call parses.
    title="Web MCP — Fetch + Websearch + Unstructured",
    theme="Nymbo/Nymbo_Theme",
)
|
397 |
|
|
|
1 |
# File: main/app.py
|
2 |
+
# Purpose: One Space that offers four tools/tabs:
|
3 |
# 1) Fetch — extract relevant page content (title, metadata, clean text, hyperlinks)
|
4 |
# 2) Websearch — structured DuckDuckGo search via LangChain tool (JSON)
|
5 |
+
# 3) Unstructured DDG — raw DuckDuckGo list[dict] rendered into a Textbox
|
6 |
+
# 4) DDG (Concise) — ultra-succinct DuckDuckGo search that emits JSONL with short keys to minimize tokens
|
7 |
#
|
8 |
# Launched with mcp_server=True so all functions are available as MCP tools.
|
9 |
|
10 |
from __future__ import annotations
|
11 |
|
12 |
+
import re # (layman) whitespace cleanup and trimming
|
13 |
+
import json # (layman) to produce compact JSON lines
|
14 |
from typing import List, Dict, Literal, Tuple
|
15 |
|
16 |
+
import gradio as gr # (layman) UI framework
|
17 |
+
import requests # (layman) to download web pages
|
18 |
+
from bs4 import BeautifulSoup # (layman) HTML parsing
|
19 |
+
from readability import Document # (layman) isolate main article content
|
20 |
+
from urllib.parse import urljoin, urldefrag, urlparse # (layman) URL helpers
|
21 |
|
22 |
# Structured search via LangChain community tool
|
23 |
from langchain_community.tools import DuckDuckGoSearchResults
|
24 |
+
# Native DDG client (used by Unstructured + Concise tabs)
|
25 |
from duckduckgo_search import DDGS
|
26 |
|
27 |
|
|
|
59 |
return text[:max_chars].rstrip() + " …", True
|
60 |
|
61 |
|
62 |
+
def _shorten(text: str, limit: int) -> str:
|
63 |
+
"""
|
64 |
+
(layman) Hard cap a string with an ellipsis to keep tokens small.
|
65 |
+
"""
|
66 |
+
if limit <= 0 or len(text) <= limit:
|
67 |
+
return text
|
68 |
+
return text[: max(0, limit - 1)].rstrip() + "…"
|
69 |
+
|
70 |
+
|
71 |
def _domain_of(url: str) -> str:
|
72 |
"""
|
73 |
(layman) Show a friendly site name like "example.com".
|
|
|
349 |
return results
|
350 |
|
351 |
|
352 |
+
# ============================================
|
353 |
+
# Concise DDG: ultra-succinct JSONL for tokens
|
354 |
+
# ============================================
|
355 |
+
|
356 |
+
def ddg_concise(  # <-- MCP tool #4 (Concise DDG)
    query: str,
    max_results: int = 5,
    include_snippets: bool = False,
    max_snippet_chars: int = 80,
    dedupe_domains: bool = True,
    title_chars: int = 80,
) -> str:
    """
    (layman) Minimal-output DuckDuckGo search designed to reduce tokens:
    - Returns newline-delimited JSON (JSONL) with short keys:
        t=title, u=url, s=snippet (optional)
    - Titles and snippets are hard-capped to save tokens.
    - Domain dedupe on by default to avoid near-duplicates.

    Args:
        query: Search text; an empty or whitespace-only query returns "".
        max_results: How many results to request from DuckDuckGo. Dedupe is
            applied afterwards, so fewer lines may be emitted.
        include_snippets: When True, add the "s" key for results that have a body.
        max_snippet_chars: Character cap applied to each snippet.
        dedupe_domains: When True, keep only the first result per domain.
        title_chars: Character cap applied to each title.

    Returns:
        A compact string like:
        {"t":"Example","u":"https://example.com/x"}
        {"t":"Another…","u":"https://a.com/y","s":"Short snippet…"}
        If the search itself fails, a single JSON object {"error": "..."}
        with the message cut to 120 characters.
    """
    # Fast guard for empty input
    if not query or not query.strip():
        return ""

    # (layman) UI sliders can hand numbers over as floats; the caps are used as
    # slice bounds downstream, which only accept integers.
    max_results = int(max_results)
    max_snippet_chars = int(max_snippet_chars)
    title_chars = int(title_chars)

    # Perform the search
    try:
        with DDGS() as ddgs:
            # (layman) Materialize inside the `with` so a lazy result iterator
            # is never consumed after the session has been closed.
            raw = list(ddgs.text(query, max_results=max_results) or [])
    except Exception as e:
        # (layman) Minimal error payload to keep tokens low
        return json.dumps({"error": str(e)[:120]}, ensure_ascii=False, separators=(",", ":"))

    # Optionally dedupe by domain to reduce redundant lines
    seen_domains = set()
    lines: List[str] = []

    for r in raw:
        title = _shorten((r.get("title") or "").strip(), title_chars)
        url = (r.get("href") or r.get("link") or "").strip()
        body = (r.get("body") or r.get("snippet") or "").strip()

        # (layman) A result without a URL is useless to the caller; skip it.
        if not url:
            continue

        if dedupe_domains:
            dom = _domain_of(url)
            if dom in seen_domains:
                continue
            seen_domains.add(dom)

        # (layman) Fall back to the site name when the result has no title.
        obj = {"t": title or _domain_of(url), "u": url}

        if include_snippets and body:
            obj["s"] = _shorten(body, max_snippet_chars)

        # Emit most compact JSON possible (no spaces)
        lines.append(json.dumps(obj, ensure_ascii=False, separators=(",", ":")))

    # Join as JSONL (each result on its own line)
    return "\n".join(lines)
|
416 |
+
|
417 |
+
|
418 |
+
# ======================
|
419 |
+
# UI: four-tab interface
|
420 |
+
# ======================
|
421 |
|
422 |
# --- Fetch tab (compact controllable extraction) ---
|
423 |
fetch_interface = gr.Interface(
|
|
|
458 |
inputs=gr.Textbox(label="Enter Search Query"), # (layman) same single input
|
459 |
outputs=gr.Textbox(label="Results", interactive=False), # (layman) Textbox showing str(list[dict])
|
460 |
title="Unstructured DDG (Raw List)", # (layman) clear label
|
461 |
+
description="Returns the raw list of results (list[dict]) shown as text.",
|
462 |
+
allow_flagging="never",
|
463 |
+
theme="Nymbo/Nymbo_Theme",
|
464 |
+
submit_btn="Search",
|
465 |
+
)
|
466 |
+
|
467 |
+
# --- Concise DDG tab (JSONL with short keys, minimal tokens) ---
|
468 |
+
concise_interface = gr.Interface(
    # (layman) Presentation: heading, help text, and look-and-feel of the tab.
    title="DDG (Concise) — Minimal Tokens",
    description="Emits JSONL with short keys (t,u[,s]). Defaults avoid snippets and duplicate domains.",
    theme="Nymbo/Nymbo_Theme",
    submit_btn="Search",
    allow_flagging="never",
    # (layman) Behaviour: the function to run and the widgets feeding it.
    fn=ddg_concise,
    # NOTE: the order below maps positionally onto ddg_concise's parameters
    # (query, max_results, include_snippets, max_snippet_chars,
    # dedupe_domains, title_chars) — do not reorder.
    inputs=[
        gr.Textbox(
            label="Query",
            placeholder="topic OR site:example.com",
        ),
        gr.Slider(
            minimum=1,
            maximum=20,
            value=5,
            step=1,
            label="Max results",
        ),
        gr.Checkbox(
            value=False,
            label="Include snippets (adds tokens)",
        ),
        gr.Slider(
            minimum=20,
            maximum=200,
            value=80,
            step=5,
            label="Max snippet chars",
        ),
        gr.Checkbox(
            value=True,
            label="Dedupe by domain",
        ),
        gr.Slider(
            minimum=20,
            maximum=120,
            value=80,
            step=5,
            label="Max title chars",
        ),
    ],
    # (layman) Read-only box so the JSONL can be copied out as-is.
    outputs=gr.Textbox(label="Results (JSONL)", interactive=False),
)
|
485 |
|
486 |
# --- Combine all into a single app with tabs ---
|
487 |
demo = gr.TabbedInterface(
    # (layman) App-wide heading and theme shared by every tab.
    title="Web MCP — Fetch + Websearch + Unstructured + Concise",
    theme="Nymbo/Nymbo_Theme",
    # (layman) Tab labels and the interfaces behind them, paired by position.
    tab_names=[
        "Fetch",
        "Websearch",
        "Unstructured DDG",
        "DDG (Concise)",
    ],
    interface_list=[
        fetch_interface,
        websearch_interface,
        unstructured_interface,
        concise_interface,
    ],
)
|
493 |
|