File size: 4,549 Bytes
8e5c5da e7c6d66 8e5c5da e7c6d66 8e5c5da e7c6d66 8e5c5da 32db98e 321422d 8e5c5da 32db98e 321422d 8e5c5da |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
import gradio as gr
import requests
from bs4 import BeautifulSoup
import urllib.parse
def fetch_and_parse_hn(url):
    """Fetch a Hacker News listing page and render it as a Markdown digest.

    Args:
        url: Address of a Hacker News listing page (front page, ``new``,
            ``ask``, ...). ``None``, non-string values and blank strings
            are treated as "no URL given".

    Returns:
        A Markdown string: the page title as a heading followed by one
        entry per story (rank, linked title, site and, when present,
        score, author and a link to the comments). Network and parsing
        failures are reported in the returned string rather than raised.
    """
    if not isinstance(url, str) or not url.strip():
        return "Please enter a URL."
    url = url.strip()

    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
        # Without a timeout a stalled server would block this call forever.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raises an HTTPError for bad responses
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract page title; `.string` is None when <title> is empty or
        # has nested markup, so fall back to the default in that case too.
        title_tag = soup.title
        page_title = (title_tag.string if title_tag else None) or "Hacker News"
        output_md = [f"# {page_title}\n"]

        # HN stories are in 'tr' tags with class 'athing'
        story_rows = soup.find_all('tr', class_='athing')
        if not story_rows:
            return "Could not find any stories on this page. Please make sure it's a valid Hacker News URL."

        for story_row in story_rows:
            # --- Story details (title, link, rank) ---
            title_span = story_row.find('span', class_='titleline')
            if title_span is None:
                continue

            rank_span = story_row.find('span', class_='rank')
            rank = rank_span.text.strip() if rank_span else ""

            link_tag = title_span.find('a')
            if link_tag is not None:
                title = link_tag.text
                article_url = link_tag.get('href', '#')
            else:
                # A title line without an anchor must not abort the digest.
                title = "No Title"
                article_url = '#'

            # Handle relative URLs for internal posts (e.g., "Ask HN:")
            if not article_url.startswith('http'):
                article_url = urllib.parse.urljoin(url, article_url)

            # The site span's own text usually already carries surrounding
            # whitespace and parentheses; strip them so they are not doubled.
            site_span = title_span.find('span', class_='sitebit')
            site_name = site_span.get_text(strip=True).strip('()') if site_span else ""
            site = f"({site_name})" if site_name else ""

            headline = f"{rank} **[{title}]({article_url})** {site}"

            # --- Metadata (points, user, comments) ---
            # Metadata is in the next 'tr' sibling
            metadata_row = story_row.find_next_sibling('tr')
            subtext = metadata_row.find('td', class_='subtext') if metadata_row else None
            if subtext is None:
                output_md.append(f"{headline}\n")
                continue

            score = subtext.find('span', class_='score')
            user = subtext.find('a', class_='hnuser')
            # The comments link is usually the last link in the subtext;
            # guard against a subtext cell that contains no links at all.
            subtext_links = subtext.find_all('a')
            comments_link = subtext_links[-1] if subtext_links else None

            # Build metadata string
            meta_parts = []
            if score:
                meta_parts.append(score.text)
            if user:
                meta_parts.append(f"by {user.text}")
            if comments_link is not None and 'item?id=' in comments_link.get('href', ''):
                comments_text = comments_link.text.replace('\xa0', ' ')  # Handle non-breaking space
                comments_url = urllib.parse.urljoin(url, comments_link['href'])
                meta_parts.append(f"[{comments_text}]({comments_url})")
            metadata_str = " | ".join(meta_parts)

            # Assemble the final markdown for the item. Every item ends with
            # a blank line so consecutive stories never merge into one paragraph.
            if metadata_str:
                output_md.append(headline)
                output_md.append(f" - *{metadata_str}*\n")
            else:
                output_md.append(f"{headline}\n")

        return "\n".join(output_md)
    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"
    except Exception as e:
        # Top-level UI boundary: surface the problem as text instead of crashing.
        return f"An unexpected error occurred during parsing: {e}"
# Build the Gradio UI: a single URL textbox in, a rendered Markdown digest out.
_url_box = gr.Textbox(
    label="Hacker News URL",
    placeholder="e.g., https://news.ycombinator.com",
    value="https://news.ycombinator.com",
)
_digest_view = gr.Markdown(label="Hacker News Digest")

# Sample listing pages offered as one-click examples below the input.
_example_urls = (
    "https://news.ycombinator.com",
    "https://news.ycombinator.com/news?p=2",
    "https://news.ycombinator.com/ask",
)

demo = gr.Interface(
    fn=fetch_and_parse_hn,
    inputs=_url_box,
    outputs=_digest_view,
    title="Hacker News Digest Fetcher",
    description="Enter a Hacker News URL (like the front page, 'new', or 'ask') to get a clean, readable digest. You can click on the story titles to go to the articles and on the comment links to see the discussions.",
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme",
    # Gradio expects one list of input values per example row.
    examples=[[example_url] for example_url in _example_urls],
)
# Launch the Gradio app only when this file is executed as a script,
# so importing the module does not start a server.
if __name__ == "__main__":
    demo.launch()