import gradio as gr
import requests
from bs4 import BeautifulSoup
import urllib.parse
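

# Hacker News listing pages share a stable table layout: each story is a
# <tr class="athing"> whose following sibling <tr> carries the score, author,
# and comments link in a <td class="subtext">. The parser below leans on
# that structure.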
def fetch_and_parse_hn(url):
    """
    This function takes a Hacker News URL, fetches its content, parses it,
    and returns a formatted Markdown string with titles, metadata, and hyperlinks.
    """
    if not url.strip():
        return "Please enter a URL."

    try:
        # A browser-like User-Agent makes the request less likely to be rejected.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
        # A timeout keeps the UI from hanging on a stalled connection.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        page_title = soup.title.string if soup.title else "Hacker News"
        output_md = [f"# {page_title}\n"]
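
        # Each story is a <tr class="athing"> row; the front page, /news?p=N,
        # and /ask all share this layout.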
        story_rows = soup.find_all('tr', class_='athing')

        if not story_rows:
            return "Could not find any stories on this page. Please make sure it's a valid Hacker News URL."

        for story_row in story_rows:
            title_span = story_row.find('span', class_='titleline')
            if not title_span:
                continue

            rank_span = story_row.find('span', class_='rank')
            rank = rank_span.text.strip() if rank_span else ""

            link_tag = title_span.find('a')
            title = link_tag.text if link_tag else "No Title"
            # link_tag can be None, so guard the href lookup as well.
            article_url = link_tag.get('href', '#') if link_tag else '#'

            # Relative links (e.g. Ask HN's item?id=...) need the page URL as base.
            if not article_url.startswith('http'):
                article_url = urllib.parse.urljoin(url, article_url)

            # The sitebit text already includes parentheses, e.g. " (example.com)",
            # so it should not be wrapped again.
            site_span = title_span.find('span', class_='sitebit')
            site = site_span.text.strip() if site_span else ""
            metadata_row = story_row.find_next_sibling('tr')
            if not metadata_row:
                output_md.append(f"{rank} **[{title}]({article_url})** {site}\n")
                continue

            subtext = metadata_row.find('td', class_='subtext')
            if not subtext:
                output_md.append(f"{rank} **[{title}]({article_url})** {site}\n")
                continue

            score = subtext.find('span', class_='score')
            user = subtext.find('a', class_='hnuser')

            # The comments link ("N comments" or "discuss") is the last anchor
            # in the subtext; guard against rows that have no anchors at all.
            subtext_links = subtext.find_all('a')
            comments_link = subtext_links[-1] if subtext_links else None

            meta_parts = []
            if score:
                meta_parts.append(score.text)
            if user:
                meta_parts.append(f"by {user.text}")
            # Only treat the anchor as a comments link if it points at an item page.
            if comments_link and 'item?id=' in comments_link.get('href', ''):
                comments_text = comments_link.text.replace('\xa0', ' ')
                comments_url = urllib.parse.urljoin(url, comments_link['href'])
                meta_parts.append(f"[{comments_text}]({comments_url})")

            metadata_str = " | ".join(meta_parts)

            output_md.append(f"{rank} **[{title}]({article_url})** {site}")
            if metadata_str:
                output_md.append(f" - *{metadata_str}*\n")

        return "\n".join(output_md)

    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"
    except Exception as e:
        return f"An unexpected error occurred during parsing: {e}"
demo = gr.Interface(
    fn=fetch_and_parse_hn,
    inputs=gr.Textbox(
        label="Hacker News URL",
        placeholder="e.g., https://news.ycombinator.com",
        value="https://news.ycombinator.com"
    ),
    outputs=gr.Markdown(label="Hacker News Digest"),
    title="Hacker News Digest Fetcher",
    description="Enter a Hacker News URL (the front page, 'new', or 'ask') to get a clean, readable digest. Click a story title to open the article, or a comments link to view the discussion.",
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme",
    examples=[["https://news.ycombinator.com"], ["https://news.ycombinator.com/news?p=2"], ["https://news.ycombinator.com/ask"]]
)

if __name__ == "__main__":
    demo.launch()