File size: 5,388 Bytes
32db98e
 
 
 
ed27cf5
32db98e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e7c6d66
32db98e
 
 
e7c6d66
 
32db98e
 
 
 
 
 
 
 
 
321422d
32db98e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e7c6d66
32db98e
321422d
32db98e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# File: app.py
# Purpose: Provide a Gradio UI that fetches a URL and (by default) returns only the
#          relevant human-readable text instead of the entire HTML.
#          Includes robust error handling, timeouts, and fallbacks.

import gradio as gr               # UI framework
import requests                   # makes the web request
from bs4 import BeautifulSoup     # parses HTML so we can work with it
from readability import Document  # distills a page down to its "main article" content
import html                       # unescapes HTML entities like &amp; → &
import re                         # simple cleanup with regex

# ---- helper: clean up text nicely -------------------------------------------
def _normalize_text(text: str) -> str:
    """
    Layman's terms: This tidies up the text we extracted so it looks nice.
    - Converts & things back to normal characters
    - Collapses too many blank lines
    - Trims leading/trailing whitespace
    """
    text = html.unescape(text)
    # Replace Windows/Mac line endings with Unix and normalize spaces
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # Collapse 3+ newlines down to 2
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()

# ---- core fetcher: return main text or raw HTML ------------------------------
def _make_soup(markup):
    """
    Parse HTML with BeautifulSoup, preferring the fast 'lxml' backend but
    falling back to the stdlib 'html.parser' when lxml is not installed
    (bs4 raises FeatureNotFound in that case — previously this silently
    broke extraction and dumped raw HTML).
    """
    try:
        return BeautifulSoup(markup, "lxml")
    except Exception:  # bs4.FeatureNotFound when the lxml backend is missing
        return BeautifulSoup(markup, "html.parser")

def fetch_page(url: str, extract_text: bool = True) -> str:
    """
    Download a web page and return its content as a string.

    Layman's terms: We download the web page. If 'extract_text' is True,
    we try to grab only the main article/important text. Otherwise we
    return the raw HTML.

    Args:
        url: Address of the page to fetch (leading/trailing spaces are trimmed).
        extract_text: When True (default), distill the page to its main
            readable text; when False, return the full raw HTML.

    Returns:
        The extracted text, the raw HTML, or a human-readable error message —
        errors are returned (not raised) so the UI always has something to show.
    """
    # Guard: a blank URL would otherwise surface as a cryptic
    # "MissingSchema" request error — fail fast with a clear message.
    if not url or not url.strip():
        return "Error: please provide a non-empty URL."
    url = url.strip()

    try:
        # Make the request with a friendly browser-like header and a timeout
        resp = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0 (compatible; FetchMCP/1.0)"},
            timeout=15,
            allow_redirects=True,
        )
        resp.raise_for_status()  # 4xx/5xx responses raise here

    except requests.exceptions.RequestException as e:
        # Any network/HTTP problem is reported nicely instead of crashing.
        return f"Request error: {e}"

    # If the user wants full HTML, behave like the original version
    if not extract_text:
        return resp.text

    # Try readability first (usually best for articles/blog posts)
    try:
        # readability extracts the "main" content and returns HTML of just that part
        doc = Document(resp.text)
        main_html = doc.summary(html_partial=True)

        # Parse the article-only HTML and get just the visible text
        soup = _make_soup(main_html)
        # Remove script/style just in case
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()

        main_text = _normalize_text(soup.get_text(separator="\n"))

        # Fallback: if extraction produced nearly nothing, try a simpler approach
        if len(main_text.split()) < 40:
            raise ValueError("Readability extraction too short; falling back")

        return main_text

    except Exception:
        # Simpler fallback: strip tags from the whole page but ignore obviously noisy areas
        try:
            soup = _make_soup(resp.text)

            # Remove common noise: scripts, styles, nav, footer, header, forms
            for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "form", "aside"]):
                tag.decompose()

            # If there's a <main> or an article-like block, prefer that
            candidate = soup.find("main") or soup.find("article") or soup.find("div", attrs={"role": "main"})
            if candidate:
                text = candidate.get_text(separator="\n")
            else:
                text = soup.get_text(separator="\n")

            return _normalize_text(text)

        except Exception as e:
            # Last resort: give raw HTML if even fallback parsing fails
            return f"Extraction fallback failed: {e}\n\n--- Raw HTML below ---\n{resp.text}"

# ---- Gradio UI ---------------------------------------------------------------
# The app window: paste a URL, pick whether to distill the page to readable
# text or keep the full HTML, then press "Fetch".
with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP") as demo:
    gr.Markdown(
        """
# Fetch MCP
Small utility that fetches a web page and returns **just the readable text** by default  
*(toggle off to get the full HTML like before)*.
        """
    )

    # Inputs: the URL field and the text-extraction toggle, each on its own row.
    with gr.Row():
        url_box = gr.Textbox(label="URL", placeholder="https://example.com/article", lines=1)
    with gr.Row():
        text_only_toggle = gr.Checkbox(value=True, label="Extract only the main readable text (recommended)")

    run_button = gr.Button("Fetch", variant="primary")

    # Plain-text output so results are easy to copy or pipe elsewhere.
    result_box = gr.Textbox(
        label="Output",
        lines=20,
        interactive=False,
        placeholder="Fetched content will appear here…",
    )

    # Clicking the button runs fetch_page(url, extract_text) -> output text.
    run_button.click(fn=fetch_page, inputs=[url_box, text_only_toggle], outputs=result_box)

# Run as normal, keeping MCP server enabled
if __name__ == "__main__":
    demo.launch(mcp_server=True)