import gradio as gr
import requests
from bs4 import BeautifulSoup
import urllib.parse
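

# Hacker News listing pages share a stable table layout: each story is a
# <tr class="athing"> whose following sibling <tr> carries the score, author,
# and comments link in a <td class="subtext">. The parser below leans on
# that structure.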
def fetch_and_parse_hn(url):
    """
    This function takes a Hacker News URL, fetches its content, parses it,
    and returns a formatted Markdown string with titles, metadata, and hyperlinks.
    """
    if not url.strip():
        return "Please enter a URL."

    try:
        # A browser-like User-Agent makes the request less likely to be rejected.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
        # A timeout keeps the UI from hanging on a stalled connection.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        page_title = soup.title.string if soup.title else "Hacker News"
        output_md = [f"# {page_title}\n"]
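
        # Each story is a <tr class="athing"> row; the front page, /news?p=N,
        # and /ask all share this layout.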
        story_rows = soup.find_all('tr', class_='athing')

        if not story_rows:
            return "Could not find any stories on this page. Please make sure it's a valid Hacker News URL."

        for story_row in story_rows:
            title_span = story_row.find('span', class_='titleline')
            if not title_span:
                continue

            rank_span = story_row.find('span', class_='rank')
            rank = rank_span.text.strip() if rank_span else ""

            link_tag = title_span.find('a')
            title = link_tag.text if link_tag else "No Title"
            # link_tag can be None, so guard the href lookup as well.
            article_url = link_tag.get('href', '#') if link_tag else '#'

            # Relative links (e.g. Ask HN's item?id=...) need the page URL as base.
            if not article_url.startswith('http'):
                article_url = urllib.parse.urljoin(url, article_url)

            # The sitebit text already includes parentheses, e.g. " (example.com)",
            # so it should not be wrapped again.
            site_span = title_span.find('span', class_='sitebit')
            site = site_span.text.strip() if site_span else ""
            metadata_row = story_row.find_next_sibling('tr')
            if not metadata_row:
                output_md.append(f"{rank} **[{title}]({article_url})** {site}\n")
                continue

            subtext = metadata_row.find('td', class_='subtext')
            if not subtext:
                output_md.append(f"{rank} **[{title}]({article_url})** {site}\n")
                continue

            score = subtext.find('span', class_='score')
            user = subtext.find('a', class_='hnuser')

            # The comments link ("N comments" or "discuss") is the last anchor
            # in the subtext; guard against rows that have no anchors at all.
            subtext_links = subtext.find_all('a')
            comments_link = subtext_links[-1] if subtext_links else None

            meta_parts = []
            if score:
                meta_parts.append(score.text)
            if user:
                meta_parts.append(f"by {user.text}")
            # Only treat the anchor as a comments link if it points at an item page.
            if comments_link and 'item?id=' in comments_link.get('href', ''):
                comments_text = comments_link.text.replace('\xa0', ' ')
                comments_url = urllib.parse.urljoin(url, comments_link['href'])
                meta_parts.append(f"[{comments_text}]({comments_url})")

            metadata_str = " | ".join(meta_parts)

            output_md.append(f"{rank} **[{title}]({article_url})** {site}")
            if metadata_str:
                output_md.append(f" - *{metadata_str}*\n")

        return "\n".join(output_md)

    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"
    except Exception as e:
        return f"An unexpected error occurred during parsing: {e}"
demo = gr.Interface(
    fn=fetch_and_parse_hn,
    inputs=gr.Textbox(
        label="Hacker News URL",
        placeholder="e.g., https://news.ycombinator.com",
        value="https://news.ycombinator.com"
    ),
    outputs=gr.Markdown(label="Hacker News Digest"),
    title="Hacker News Digest Fetcher",
    description="Enter a Hacker News URL (the front page, 'new', or 'ask') to get a clean, readable digest. Click a story title to open the article, or a comments link to view the discussion.",
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme",
    examples=[["https://news.ycombinator.com"], ["https://news.ycombinator.com/news?p=2"], ["https://news.ycombinator.com/ask"]]
)

if __name__ == "__main__":
    demo.launch()