Update app.py
app.py CHANGED
@@ -26,7 +26,7 @@ from duckduckgo_search import DDGS
 
 def _http_get(url: str) -> requests.Response:
     """
-
+    Download the page politely with a short timeout and realistic headers.
     """
     headers = {
         "User-Agent": "Mozilla/5.0 (compatible; WebMCP/1.0; +https://example.com)",
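Only the first lines of the body survive the hunk, so the rest of the helper is not visible here. A minimal sketch of how a "polite" fetch along these lines might read; the timeout value and the `raise_for_status()` call are assumptions, not code from this commit:

```python
import requests

def _http_get(url: str) -> requests.Response:
    """Download the page politely with a short timeout and realistic headers."""
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; WebMCP/1.0; +https://example.com)",
    }
    # Assumed completion: the diff truncates the body after the headers dict.
    resp = requests.get(url, headers=headers, timeout=15)
    resp.raise_for_status()
    return resp
```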
@@ -38,7 +38,7 @@ def _http_get(url: str) -> requests.Response:
 
 def _normalize_whitespace(text: str) -> str:
     """
-
+    Squeeze extra spaces and blank lines to keep things compact.
     """
     text = re.sub(r"[ \t\u00A0]+", " ", text)
     text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())
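Both substitutions are visible in the context lines, so the new docstring can be checked directly; only the trailing `return text` is assumed in this self-contained copy:

```python
import re

def _normalize_whitespace(text: str) -> str:
    """Squeeze extra spaces and blank lines to keep things compact."""
    text = re.sub(r"[ \t\u00A0]+", " ", text)               # collapse spaces, tabs, NBSP
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())   # 3+ newlines -> one blank line
    return text  # assumed; the hunk ends before the return

assert _normalize_whitespace("a \u00A0 b\n\n\n\nc") == "a b\n\nc"
```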
@@ -47,7 +47,7 @@ def _normalize_whitespace(text: str) -> str:
 
 def _truncate(text: str, max_chars: int) -> Tuple[str, bool]:
     """
-
+    Cut text if it gets too long; return the text and whether we trimmed.
     """
    if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
         return text, False
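The guard and its early return are in the hunk, but the actual trim is cut off. An assumed completion consistent with the docstring:

```python
from typing import Tuple

def _truncate(text: str, max_chars: int) -> Tuple[str, bool]:
    """Cut text if it gets too long; return the text and whether we trimmed."""
    if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
        return text, False
    # Assumed completion: cut at the limit and drop a dangling partial word.
    return text[:max_chars].rsplit(" ", 1)[0].rstrip(), True
```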
@@ -56,7 +56,7 @@ def _truncate(text: str, max_chars: int) -> Tuple[str, bool]:
 
 def _shorten(text: str, limit: int) -> str:
     """
-
+    Hard cap a string with an ellipsis to keep tokens small.
     """
     if limit <= 0 or len(text) <= limit:
         return text
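Same pattern as `_truncate`, with the cut again elided by the hunk; the ellipsis handling below is an assumed completion matching the docstring:

```python
def _shorten(text: str, limit: int) -> str:
    """Hard cap a string with an ellipsis to keep tokens small."""
    if limit <= 0 or len(text) <= limit:
        return text
    # Assumed completion: leave room for the ellipsis character itself.
    return text[: max(0, limit - 1)].rstrip() + "…"
```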
@@ -65,7 +65,7 @@ def _shorten(text: str, limit: int) -> str:
 
 def _domain_of(url: str) -> str:
     """
-
+    Show a friendly site name like "example.com".
     """
     try:
         return urlparse(url).netloc or ""
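For reference, the `urlparse` behavior the helper leans on, including why the `or ""` fallback matters:

```python
from urllib.parse import urlparse

print(urlparse("https://example.com/article?id=1").netloc)  # example.com
print(urlparse("not a url").netloc or "")                   # "" (netloc is empty, fallback kicks in)
```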
@@ -85,7 +85,7 @@ def _og(soup: BeautifulSoup, prop: str) -> str | None:
 
 def _extract_metadata(soup: BeautifulSoup, final_url: str) -> Dict[str, str]:
     """
-
+    Pull the useful bits: title, description, site name, canonical URL, language, etc.
     """
     meta: Dict[str, str] = {}
 
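The body after `meta: Dict[str, str] = {}` is hidden, so the following is a hypothetical sketch of a metadata extractor matching the docstring; the exact tags the real code reads (and how it uses the `_og` helper visible in the hunk header) may differ:

```python
from typing import Dict
from bs4 import BeautifulSoup

def _extract_metadata(soup: BeautifulSoup, final_url: str) -> Dict[str, str]:
    """Pull the useful bits: title, description, site name, canonical URL, language, etc."""
    meta: Dict[str, str] = {}
    # Assumed body: plain-tag fallbacks; the real code likely prefers OpenGraph via _og().
    if soup.title and soup.title.string:
        meta["title"] = soup.title.string.strip()
    desc = soup.find("meta", attrs={"name": "description"})
    if desc and desc.get("content"):
        meta["description"] = desc["content"].strip()
    canonical = soup.find("link", rel="canonical")
    if canonical and canonical.get("href"):
        meta["canonical"] = canonical["href"]
    html_tag = soup.find("html")
    if html_tag and html_tag.get("lang"):
        meta["language"] = html_tag["lang"]
    meta["url"] = final_url
    return meta
```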
@@ -123,7 +123,7 @@ def _extract_metadata(soup: BeautifulSoup, final_url: str) -> Dict[str, str]:
 
 def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
     """
-
+    Use Readability to isolate the main article and turn it into clean text.
     Returns (clean_text, soup_of_readable_html).
     """
     # Simplified article HTML from Readability
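The context comment names Readability, which suggests the `readability-lxml` package. A sketch of the likely shape, assuming `Document(html).summary()` produces the simplified article HTML:

```python
from typing import Tuple
from bs4 import BeautifulSoup
from readability import Document  # readability-lxml

def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
    """Use Readability to isolate the main article and turn it into clean text."""
    doc = Document(html)
    readable_html = doc.summary(html_partial=True)  # simplified article HTML from Readability
    readable_soup = BeautifulSoup(readable_html, "lxml")
    # _normalize_whitespace is the helper defined earlier in app.py.
    text = _normalize_whitespace(readable_soup.get_text("\n"))
    return text, readable_soup
```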
@@ -151,7 +151,7 @@ def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
 
 def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> List[Tuple[str, str]]:
     """
-
+    Collect clean, unique, absolute links from the readable section only.
     """
     seen = set()
     links: List[Tuple[str, str]] = []
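`seen` and `links` are initialized in the hunk; the loop itself is cut off. An assumed completion honoring the docstring's "clean, unique, absolute" contract:

```python
from typing import List, Tuple
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> List[Tuple[str, str]]:
    """Collect clean, unique, absolute links from the readable section only."""
    seen = set()
    links: List[Tuple[str, str]] = []
    # Assumed loop: absolutize against base_url, dedupe, stop at the cap.
    for a in readable_soup.find_all("a", href=True):
        if len(links) >= max_links:
            break
        href = urljoin(base_url, a["href"])
        if href.startswith(("http://", "https://")) and href not in seen:
            seen.add(href)
            # Tuple order (text, url) is a guess; the diff never shows a consumer.
            links.append((a.get_text(strip=True) or href, href))
    return links
```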
@@ -193,7 +193,7 @@ def _format_markdown(
     verbosity: str,
 ) -> str:
     """
-
+    Assemble a compact Markdown summary with optional sections.
     """
     lines: List[str] = []
 
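Only the signature tail and the `lines` accumulator are visible, so this is a deliberately simplified, hypothetical sketch of the assembly; the real function clearly takes more parameters than shown here:

```python
from typing import Dict, List

def _format_markdown_sketch(meta: Dict[str, str], body: str, verbosity: str) -> str:
    # Hypothetical reduction of _format_markdown to its accumulate-and-join shape.
    lines: List[str] = []
    lines.append(f"# {meta.get('title', 'Untitled')}")
    if verbosity != "Brief" and meta.get("description"):
        lines.append(f"> {meta['description']}")
    lines.append(body)
    return "\n\n".join(lines)
```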
@@ -412,7 +412,7 @@ def Search_Concise(  # <-- MCP tool #4 (Concise DDG)
 
 # --- Fetch tab (compact controllable extraction) ---
 fetch_interface = gr.Interface(
-    fn=Fetch_Webpage, #
+    fn=Fetch_Webpage, # connect the function to the UI
     inputs=[
         gr.Textbox(label="URL", placeholder="https://example.com/article"),
         gr.Dropdown(label="Verbosity", choices=["Brief", "Standard", "Full"], value="Standard"),
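For readers unfamiliar with Gradio: `fn=` is what the new comment refers to, binding a plain Python function to the declared input components. A stripped-down, runnable version of the pattern; the output component and the simplified signature are assumptions, since the hunk ends before them:

```python
import gradio as gr

def Fetch_Webpage(url: str, verbosity: str) -> str:  # simplified stand-in signature
    return f"Would fetch {url} at {verbosity!r} verbosity."

fetch_interface = gr.Interface(
    fn=Fetch_Webpage,  # connect the function to the UI
    inputs=[
        gr.Textbox(label="URL", placeholder="https://example.com/article"),
        gr.Dropdown(label="Verbosity", choices=["Brief", "Standard", "Full"], value="Standard"),
    ],
    outputs=gr.Markdown(),  # assumed output component
)
```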
@@ -431,7 +431,7 @@ fetch_interface = gr.Interface(
 
 # --- Websearch tab (structured DDG via LangChain) ---
 websearch_interface = gr.Interface(
-    fn=Search_Structured, #
+    fn=Search_Structured, # connect the function to the UI
     inputs=[
         gr.Textbox(value="", label="Search query", placeholder="site:example.com interesting topic"),
         gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results"),
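Given the `# <-- MCP tool` markers elsewhere in the diff, the per-tab interfaces are presumably combined and served as MCP tools. A sketch of that assumed wiring, using Gradio's real `TabbedInterface` and `mcp_server` flag; the tab names and the final launch call are guesses about code outside these hunks:

```python
import gradio as gr

# Assumed app assembly: each gr.Interface becomes one tab, and
# mcp_server=True exposes each fn as an MCP tool.
demo = gr.TabbedInterface(
    [fetch_interface, websearch_interface],
    ["Fetch", "Websearch"],
)

if __name__ == "__main__":
    demo.launch(mcp_server=True)
```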