|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import requests |
|
from bs4 import BeautifulSoup |
|
from readability import Document |
|
from urllib.parse import urljoin |
|
import gradio as gr |
|
|
|
def extract_relevant_text(html: str, base_url: str) -> str:
    """Distill raw HTML into a readable plain-text snapshot.

    Args:
        html: The page's raw HTML source.
        base_url: Base URL used to resolve relative ``<a href="">`` targets.

    Returns:
        A formatted string with title, description, main body text, and
        the links found in the readable portion of the page.
    """
    readable = Document(html)

    # Parse only the readability-extracted "main content" for body and links.
    cleaned = BeautifulSoup(readable.summary(), "lxml")

    paragraphs = []
    for node in cleaned.find_all(["p", "li"]):
        if node.get_text(strip=True):
            paragraphs.append(node.get_text(" ", strip=True))
    main_text = "\n\n".join(paragraphs) or "[No main text extracted]"

    # The description meta tags live in <head>, which readability strips,
    # so look them up in the full document. Prefer the standard
    # name="description" tag, falling back to Open Graph.
    page = BeautifulSoup(html, "lxml")
    meta_desc = ""
    for attrs in ({"name": "description"}, {"property": "og:description"}):
        tag = page.find("meta", attrs=attrs)
        if tag and tag.get("content"):
            meta_desc = tag["content"].strip()
            break

    # Collect links from the readable content only, resolved to absolute URLs.
    link_lines = []
    for anchor in cleaned.find_all("a", href=True):
        label = anchor.get_text(" ", strip=True) or "[link]"
        link_lines.append(f"• {label} → {urljoin(base_url, anchor['href'])}")

    return "\n\n".join([
        f"Title: {readable.short_title()}",
        f"Description: {meta_desc or '[None]'}",
        f"Body:\n{main_text}",
        "Links:\n" + ("\n".join(link_lines) if link_lines else "[No links]"),
    ])
|
|
|
|
|
def fetch_content(url: str) -> str:
    """Download *url* and return a clean text summary of the page.

    Network problems (DNS failures, timeouts, non-2xx responses, ...)
    are reported as a string rather than raised, so the UI always has
    something to display.
    """
    request_headers = {"User-Agent": "Mozilla/5.0 (compatible; CleanFetcher/1.0)"}

    try:
        resp = requests.get(url, headers=request_headers, timeout=15)
        resp.raise_for_status()  # treat HTTP error statuses as failures
    except requests.exceptions.RequestException as err:
        return f"[Error] {err}"

    return extract_relevant_text(resp.text, url)
|
|
|
|
|
|
|
# Gradio UI: one URL textbox in, a read-only multiline text summary out.
demo = gr.Interface(

    fn=fetch_content,

    inputs=gr.Textbox(label="URL", placeholder="https://example.com"),

    outputs=gr.Textbox(

        label="Clean Page Snapshot",

        # Output is display-only; users should not edit the summary.
        interactive=False,

        lines=25,

    ),

    title="Clean Web Snapshot",

    description="Enter a URL to retrieve a tidy text summary (title, description, main content, and links).",

    # Disable Gradio's built-in flagging button entirely.
    allow_flagging="never",

    theme="Nymbo/Nymbo_Theme",

)



if __name__ == "__main__":


    # mcp_server=True additionally exposes the app's functions over the
    # Model Context Protocol alongside the normal web UI.
    demo.launch(mcp_server=True)
|
|