import gradio as gr
import requests
from bs4 import BeautifulSoup
import urllib.parse

def fetch_and_parse_hn(url):
    """
    This function takes a Hacker News URL, fetches its content, parses it,
    and returns a formatted Markdown string with titles, metadata, and hyperlinks.
    """
    if not url.strip():
        return "Please enter a URL."
        
    try:
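        # A browser-like User-Agent is sent as a precaution: some servers
        # throttle or block the default python-requests agent (an assumption
        # here, not a documented HN requirement).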
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raises an HTTPError for bad responses

        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract page title
        page_title = soup.title.string.strip() if soup.title and soup.title.string else "Hacker News"
        output_md = [f"# {page_title}\n"]

        # HN stories are in 'tr' tags with class 'athing'
        story_rows = soup.find_all('tr', class_='athing')

        if not story_rows:
            return "Could not find any stories on this page. Please make sure it's a valid Hacker News URL."

        for story_row in story_rows:
            # --- Story Details (title, link, rank) ---
            title_span = story_row.find('span', class_='titleline')
            if not title_span:
                continue

            rank_span = story_row.find('span', class_='rank')
            rank = rank_span.text.strip() if rank_span else ""

            link_tag = title_span.find('a')
            title = link_tag.text if link_tag else "No Title"
            article_url = link_tag.get('href', '#') if link_tag else '#'

            # Internal posts (e.g., "Ask HN:") use relative hrefs such as
            # "item?id=12345"; resolve them against the page URL
            if not article_url.startswith('http'):
                article_url = urllib.parse.urljoin(url, article_url)

            site_span = title_span.find('span', class_='sitebit')
            site = f"({site_span.text})" if site_span else ""

            # --- Metadata (points, user, comments) ---
            # Metadata is in the next 'tr' sibling
            metadata_row = story_row.find_next_sibling('tr')
            if not metadata_row:
                output_md.append(f"{rank} **[{title}]({article_url})** {site}\n")
                continue

            subtext = metadata_row.find('td', class_='subtext')
            if not subtext:
                output_md.append(f"{rank} **[{title}]({article_url})** {site}\n")
                continue

            score = subtext.find('span', class_='score')
            user = subtext.find('a', class_='hnuser')
            
            # The comments link is usually the last link in the subtext;
            # guard against a subtext row with no links at all
            subtext_links = subtext.find_all('a')
            comments_link = subtext_links[-1] if subtext_links else None
            
            # Build metadata string
            meta_parts = []
            if score:
                meta_parts.append(score.text)
            if user:
                meta_parts.append(f"by {user.text}")
            if comments_link and 'item?id=' in comments_link.get('href', ''):
                comments_text = comments_link.text.replace('\xa0', ' ') # Handle non-breaking space
                comments_url = urllib.parse.urljoin(url, comments_link['href'])
                meta_parts.append(f"[{comments_text}]({comments_url})")
            
            metadata_str = " | ".join(meta_parts)

            # Assemble the final markdown for the item
            output_md.append(f"{rank} **[{title}]({article_url})** {site}")
            if metadata_str:
                output_md.append(f"   - *{metadata_str}*\n")

        return "\n".join(output_md)

    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"
    except Exception as e:
        return f"An unexpected error occurred during parsing: {e}"

# Define the Gradio interface
demo = gr.Interface(
    fn=fetch_and_parse_hn,
    inputs=gr.Textbox(
        label="Hacker News URL",
        placeholder="e.g., https://news.ycombinator.com",
        value="https://news.ycombinator.com"
    ),
    outputs=gr.Markdown(label="Hacker News Digest"),
    title="Hacker News Digest Fetcher",
    description="Enter a Hacker News URL (like the front page, 'new', or 'ask') to get a clean, readable digest. You can click on the story titles to go to the articles and on the comment links to see the discussions.",
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme",
    examples=[["https://news.ycombinator.com"], ["https://news.ycombinator.com/news?p=2"], ["https://news.ycombinator.com/ask"]]
)

if __name__ == "__main__":
    demo.launch()
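    # To serve on a LAN or inside a container, launch() accepts standard
    # Gradio host/port arguments (a sketch; adjust to your deployment):
    # demo.launch(server_name="0.0.0.0", server_port=7860)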