Nymbo committed on
Commit a655b89 · verified · 1 Parent(s): 8d4d649

Create app.py

Files changed (1)
  1. app.py +371 -0
app.py ADDED
@@ -0,0 +1,371 @@
# File: main/app.py
# Purpose: One Space that offers two tools:
# 1) Fetch: extract relevant page content (title, metadata, clean text, hyperlinks)
# 2) Websearch: DuckDuckGo web search
#
# Notes:
# - Launched with mcp_server=True so both functions are available as MCP tools.
# - UI uses TabbedInterface so you can use each tool from its own tab.
# - Inline comments explain each section in plain language.

from __future__ import annotations

import re  # (layman) used to tidy up whitespace
from typing import List, Dict, Literal, Tuple

import gradio as gr  # (layman) the UI framework
import requests  # (layman) to download web pages
from bs4 import BeautifulSoup  # (layman) for parsing HTML
from readability import Document  # (layman) to isolate main readable content
from urllib.parse import urljoin, urldefrag, urlparse  # (layman) to fix/clean URLs

# DuckDuckGo via LangChain community tool
from langchain_community.tools import DuckDuckGoSearchResults

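# (layman) Dependencies implied by the imports above (the usual package names
# are listed here as a guide; pin whatever your requirements.txt actually uses):
# gradio (with MCP support), requests, beautifulsoup4, lxml, readability-lxml,
# langchain-community, plus a DuckDuckGo search package such as
# duckduckgo-search that the LangChain wrapper calls into.
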
# ==============================
# Fetch: HTTP + extraction utils
# ==============================

def _http_get(url: str) -> requests.Response:
    """
    (layman) Download the page politely with a short timeout and realistic headers.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; WebMCP/1.0; +https://example.com)",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    return requests.get(url, headers=headers, timeout=15)


def _normalize_whitespace(text: str) -> str:
    """
    (layman) Squeeze extra spaces and blank lines to keep things compact.
    """
    text = re.sub(r"[ \t\u00A0]+", " ", text)
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())
    return text.strip()


def _truncate(text: str, max_chars: int) -> Tuple[str, bool]:
    """
    (layman) Cut text if it gets too long; return the text and whether we trimmed.
    """
    if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
        return text, False
    return text[:max_chars].rstrip() + " …", True

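# (layman) For example, _truncate("abcdef", 4) returns ("abcd …", True), while
# _truncate("abc", 10) returns ("abc", False) because nothing was cut.
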

def _domain_of(url: str) -> str:
    """
    (layman) Show a friendly site name like "example.com".
    """
    try:
        return urlparse(url).netloc or ""
    except Exception:
        return ""


def _meta(soup: BeautifulSoup, name: str) -> str | None:
    tag = soup.find("meta", attrs={"name": name})
    return tag.get("content") if tag and tag.has_attr("content") else None


def _og(soup: BeautifulSoup, prop: str) -> str | None:
    tag = soup.find("meta", attrs={"property": prop})
    return tag.get("content") if tag and tag.has_attr("content") else None


def _extract_metadata(soup: BeautifulSoup, final_url: str) -> Dict[str, str]:
    """
    (layman) Pull the useful bits: title, description, site name, canonical URL, language, etc.
    """
    meta: Dict[str, str] = {}

    # Title preference: <title> > og:title > twitter:title
    title_candidates = [
        (soup.title.string if soup.title and soup.title.string else None),
        _og(soup, "og:title"),
        _meta(soup, "twitter:title"),
    ]
    meta["title"] = next((t.strip() for t in title_candidates if t and t.strip()), "")

    # Description preference: description > og:description > twitter:description
    desc_candidates = [
        _meta(soup, "description"),
        _og(soup, "og:description"),
        _meta(soup, "twitter:description"),
    ]
    meta["description"] = next((d.strip() for d in desc_candidates if d and d.strip()), "")

    # Canonical link (helps dedupe)
    link_canonical = soup.find("link", rel=lambda v: v and "canonical" in v)
    meta["canonical"] = (link_canonical.get("href") or "").strip() if link_canonical else ""

    # Site name + language info if present
    meta["site_name"] = (_og(soup, "og:site_name") or "").strip()
    html_tag = soup.find("html")
    meta["lang"] = (html_tag.get("lang") or "").strip() if html_tag else ""

    # Final URL + domain
    meta["fetched_url"] = final_url
    meta["domain"] = _domain_of(final_url)

    return meta


def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
    """
    (layman) Use Readability to isolate the main article and turn it into clean text.
    Returns (clean_text, soup_of_readable_html).
    """
    # Simplified article HTML from Readability
    doc = Document(html)
    readable_html = doc.summary(html_partial=True)

    # Parse simplified HTML
    s = BeautifulSoup(readable_html, "lxml")

    # Remove noisy tags
    for sel in ["script", "style", "noscript", "iframe", "svg"]:
        for tag in s.select(sel):
            tag.decompose()

    # Keep paragraphs, list items, and subheadings for structure without bloat
    text_parts: List[str] = []
    for p in s.find_all(["p", "li", "h2", "h3", "h4", "blockquote"]):
        chunk = p.get_text(" ", strip=True)
        if chunk:
            text_parts.append(chunk)

    clean_text = _normalize_whitespace("\n\n".join(text_parts))
    return clean_text, s


def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> List[Tuple[str, str]]:
    """
    (layman) Collect clean, unique, absolute links from the readable section only.
    """
    seen = set()
    links: List[Tuple[str, str]] = []

    for a in readable_soup.find_all("a", href=True):
        href = a.get("href").strip()
        # Skip junk links we can't use
        if not href or href.startswith("#") or href.startswith("mailto:") or href.startswith("javascript:"):
            continue

        # Resolve relative URLs, strip fragments (#…)
        absolute = urljoin(base_url, href)
        absolute, _ = urldefrag(absolute)

        if absolute in seen:
            continue
        seen.add(absolute)

        text = a.get_text(" ", strip=True)
        if len(text) > 120:
            text = text[:117] + "…"

        links.append((text or absolute, absolute))

        if len(links) >= max_links > 0:
            break

    return links


def _format_markdown(
    meta: Dict[str, str],
    body: str,
    body_truncated: bool,
    links: List[Tuple[str, str]],
    include_text: bool,
    include_metadata: bool,
    include_links: bool,
    verbosity: str,
) -> str:
    """
    (layman) Assemble a compact Markdown summary with optional sections.
    """
    lines: List[str] = []

    # Title header
    title = meta.get("title") or meta.get("domain") or "Untitled"
    lines.append(f"# {title}")

    # Metadata section (only show what exists)
    if include_metadata:
        md: List[str] = []
        if meta.get("description"):
            md.append(f"- **Description:** {meta['description']}")
        if meta.get("site_name"):
            md.append(f"- **Site:** {meta['site_name']}")
        if meta.get("canonical"):
            md.append(f"- **Canonical:** {meta['canonical']}")
        if meta.get("lang"):
            md.append(f"- **Language:** {meta['lang']}")
        if meta.get("fetched_url"):
            md.append(f"- **Fetched From:** {meta['fetched_url']}")
        if md:
            lines.append("## Metadata")
            lines.extend(md)

    # Body text
    if include_text and body:
        if verbosity == "Brief":
            brief, was_more = _truncate(body, 800)
            lines.append("## Text")
            lines.append(brief)
            if was_more or body_truncated:
                lines.append("\n> (Trimmed for brevity)")
        else:
            lines.append("## Text")
            lines.append(body)
            if body_truncated:
                lines.append("\n> (Trimmed for brevity)")

    # Links section
    if include_links and links:
        lines.append(f"## Links ({len(links)})")
        for text, url in links:
            lines.append(f"- [{text}]({url})")

    return "\n\n".join(lines).strip()


def extract_relevant(  # <-- MCP tool #1
    url: str,
    verbosity: str = "Standard",
    include_metadata: bool = True,
    include_text: bool = True,
    include_links: bool = True,
    max_chars: int = 3000,
    max_links: int = 20,
) -> str:
    """
    (layman) Given a URL, return a tight Markdown summary: title, key metadata, readable text, and links.
    """
    if not url or not url.strip():
        return "Please enter a valid URL."

    try:
        resp = _http_get(url)
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"

    final_url = str(resp.url)
    ctype = resp.headers.get("Content-Type", "")
    if "html" not in ctype.lower():
        return f"Unsupported content type for extraction: {ctype or 'unknown'}"

    # Decode to text
    resp.encoding = resp.encoding or resp.apparent_encoding
    html = resp.text

    # Full-page soup for metadata
    full_soup = BeautifulSoup(html, "lxml")
    meta = _extract_metadata(full_soup, final_url)

    # Readable content
    body_text, readable_soup = _extract_main_text(html)
    if not body_text:
        # Fallback to "whole-page text" if Readability found nothing
        fallback_text = full_soup.get_text(" ", strip=True)
        body_text = _normalize_whitespace(fallback_text)

    # Verbosity presets (we keep the smaller of preset vs. user cap)
    preset_caps = {"Brief": 1200, "Standard": 3000, "Full": 999_999}
    target_cap = preset_caps.get(verbosity, 3000)
    cap = min(max_chars if max_chars > 0 else target_cap, target_cap)
    body_text, truncated = _truncate(body_text, cap) if include_text else ("", False)

    # Extract links from the simplified content only
    links = _extract_links(readable_soup, final_url, max_links=max_links if include_links else 0)

    # Final compact Markdown
    md = _format_markdown(
        meta=meta,
        body=body_text,
        body_truncated=truncated,
        links=links,
        include_text=include_text,
        include_metadata=include_metadata,
        include_links=include_links,
        verbosity=verbosity,
    )
    return md or "No content could be extracted."
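# (layman) Illustrative call (any reachable URL works; this only describes the
# expected shape of the output, not real data):
#     extract_relevant("https://example.com", verbosity="Brief")
# returns a Markdown string that starts with "# <page title>" followed by the
# optional "## Metadata", "## Text", and "## Links (N)" sections assembled in
# _format_markdown above.
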


# ==========================
# Websearch: DuckDuckGo tool
# ==========================

def web_search(  # <-- MCP tool #2
    input_query: str,
    max_results: int = 5,
) -> List[Dict[Literal["snippet", "title", "link"], str]]:
    """
    (layman) Run a DuckDuckGo search and return a list of {snippet, title, link}.
    """
    if not input_query or not input_query.strip():
        return []

    # Create the search tool (LangChain community wrapper)
    search = DuckDuckGoSearchResults(output_format="list", num_results=max_results)

    # Run the search and return results as a list of dicts
    results = search.invoke(input_query)
    return results
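# (layman) Shape of the returned list (placeholder values; real snippets,
# titles, and links come from DuckDuckGo at call time):
#     [
#         {"snippet": "...", "title": "...", "link": "https://..."},
#         ...
#     ]
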


# =====================
# UI: two-tab interface
# =====================

# --- Fetch tab (compact controllable extraction) ---
fetch_interface = gr.Interface(
    fn=extract_relevant,  # (layman) connect the function to the UI
    inputs=[
        gr.Textbox(label="URL", placeholder="https://example.com/article"),
        gr.Dropdown(label="Verbosity", choices=["Brief", "Standard", "Full"], value="Standard"),
        gr.Checkbox(value=True, label="Include Metadata"),
        gr.Checkbox(value=True, label="Include Main Text"),
        gr.Checkbox(value=True, label="Include Links"),
        gr.Slider(400, 12000, value=3000, step=100, label="Max Characters (body text)"),
        gr.Slider(0, 100, value=20, step=1, label="Max Links"),
    ],
    outputs=gr.Markdown(label="Extracted Summary"),
    title="Fetch — Clean Extract",
    description="Extract title, key metadata, readable text, and links. No noisy HTML.",
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme",
)

# --- Websearch tab (DuckDuckGo) ---
websearch_interface = gr.Interface(
    fn=web_search,  # (layman) connect the function to the UI
    inputs=[
        gr.Textbox(value="", label="Search query", placeholder="site:example.com interesting topic"),
        gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results"),
    ],
    outputs=gr.JSON(label="Search results"),
    title="Websearch — DuckDuckGo",
    description="Search the web using DuckDuckGo; returns snippet, title, and link.",
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme",
)

# --- Combine both into a single app with tabs ---
demo = gr.TabbedInterface(
    interface_list=[fetch_interface, websearch_interface],
    tab_names=["Fetch", "Websearch"],
    title="Web MCP — Fetch + Websearch",
    theme="Nymbo/Nymbo_Theme",
)

# Launch the UI and expose both functions as MCP tools in one server
if __name__ == "__main__":
    demo.launch(mcp_server=True)
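# (layman) With mcp_server=True, the same server exposes extract_relevant and
# web_search as MCP tools alongside the tabbed web UI; the exact MCP endpoint
# URL depends on your Gradio version (check the Gradio MCP docs). For a quick
# manual check without an MCP client, the functions can also be called
# directly, e.g.:
#     python -c "from app import extract_relevant; print(extract_relevant('https://example.com'))"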