File size: 3,813 Bytes
60bdd74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f890eb5
60bdd74
 
ac9f3b0
60bdd74
 
 
 
 
ac9f3b0
60bdd74
ac9f3b0
60bdd74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ac9f3b0
60bdd74
ac9f3b0
 
60bdd74
ac9f3b0
60bdd74
 
ac9f3b0
f890eb5
60bdd74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32db98e
321422d
60bdd74
f890eb5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# app.py
# Hugging Face Space: Cleaner web-page fetcher
# -------------------------------------------------------------
# Fetches a URL and returns a concise, human-readable snapshot:
#   β€’ Title
#   β€’ Meta description
#   β€’ Main text (readability-extracted)
#   β€’ Hyperlinks (anchor text β†’ absolute URL)
# -------------------------------------------------------------

import requests                         # HTTP client
from bs4 import BeautifulSoup           # HTML parsing
from readability import Document        # Boiler-plate removal
from urllib.parse import urljoin        # Build absolute link URLs
import gradio as gr                     # UI framework

def extract_relevant_text(html: str, base_url: str) -> str:
    """
    Turn raw HTML into a readable plain-text snapshot.

    Parameters
    ----------
    html : str
        The page's full HTML source.
    base_url : str
        Base URL used to resolve relative ``<a href="">`` targets.

    Returns
    -------
    str
        Formatted string with title, meta description, main body text,
        and a bulleted list of hyperlinks.
    """
    # Readability isolates the primary article content from boilerplate.
    doc = Document(html)
    page_title = doc.short_title()
    content_soup = BeautifulSoup(doc.summary(), "lxml")

    # Visible text of paragraphs and list items from the cleaned content.
    paragraphs = []
    for node in content_soup.find_all(["p", "li"]):
        if node.get_text(strip=True):
            paragraphs.append(node.get_text(" ", strip=True))
    main_text = "\n\n".join(paragraphs) if paragraphs else "[No main text extracted]"

    # The meta description lives in the *original* document, not the
    # readability-cleaned one; fall back to the Open Graph description.
    full_soup = BeautifulSoup(html, "lxml")
    description = ""
    for attrs in ({"name": "description"}, {"property": "og:description"}):
        tag = full_soup.find("meta", attrs=attrs)
        if tag and tag.get("content"):
            description = tag["content"].strip()
            break

    # Hyperlinks from the cleaned content, resolved to absolute URLs.
    link_lines = []
    for anchor in content_soup.find_all("a", href=True):
        label = anchor.get_text(" ", strip=True) or "[link]"
        link_lines.append(f"β€’ {label} β†’ {urljoin(base_url, anchor['href'])}")

    # Assemble the final plaintext report.
    sections = (
        f"Title: {page_title}",
        f"Description: {description or '[None]'}",
        f"Body:\n{main_text}",
        "Links:\n" + ("\n".join(link_lines) if link_lines else "[No links]"),
    )
    return "\n\n".join(sections)


def fetch_content(url: str) -> str:
    """
    Fetch *url* and return a concise plain-text summary of the page.

    Network and HTTP errors are caught and returned as an "[Error] ..."
    string rather than raised, so the UI always receives displayable text.
    """
    try:
        # Friendly user-agent prevents some 403s
        headers = {"User-Agent": "Mozilla/5.0 (compatible; CleanFetcher/1.0)"}
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()  # 4xx/5xx β†’ exception

        # Fix: when a text/* response declares no charset, requests falls
        # back to ISO-8859-1, which garbles UTF-8 pages. Prefer the encoding
        # sniffed from the document body in that case. (apparent_encoding
        # still detects genuine Latin-1 content correctly.)
        if response.encoding is None or response.encoding.lower() == "iso-8859-1":
            response.encoding = response.apparent_encoding

        return extract_relevant_text(response.text, url)

    except requests.exceptions.RequestException as err:
        # Any network or HTTP error bubbles up here
        return f"[Error] {err}"


# -------------------------- Gradio UI --------------------------
# Single-function interface: one URL textbox in, one read-only text
# snapshot out, wired directly to fetch_content above.
demo = gr.Interface(
    fn=fetch_content,
    inputs=gr.Textbox(label="URL", placeholder="https://example.com"),
    outputs=gr.Textbox(
        label="Clean Page Snapshot",
        interactive=False,        # output is display-only
        lines=25,                 # taller box for readability
    ),
    title="Clean Web Snapshot",
    description="Enter a URL to retrieve a tidy text summary (title, description, main content, and links).",
    allow_flagging="never",       # NOTE(review): renamed to flagging_mode in Gradio 4.x — confirm the pinned version accepts this
    theme="Nymbo/Nymbo_Theme",
)

if __name__ == "__main__":
    # Expose as an MCP server so you can chain it with other Spaces
    demo.launch(mcp_server=True)