lvwerra HF Staff committed on
Commit
b6cc122
·
verified ·
1 Parent(s): 792ea9a

Create run.py

Browse files
Files changed (1) hide show
  1. run.py +205 -0
run.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ AI News Summarizer
4
+
5
+ A script to fetch, summarize, and create reports on recent AI news articles based on a specified topic.
6
+ """
7
+
8
+ import argparse
9
+ from huggingface_hub import HfApi, InferenceClient
10
+ from newspaper import Article
11
+ import pandas as pd
12
+ import requests
13
+ from datetime import date, timedelta
14
+ import json
15
+ import os
16
+ from tqdm.auto import tqdm
17
+
18
def parse_arguments(argv=None):
    """Parse command line arguments.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) the process command line is parsed, which is the
            original behavior; passing a list allows programmatic use.

    Returns:
        An ``argparse.Namespace`` with the attributes ``topic``,
        ``num_articles``, ``provider`` and ``repo_id``.
    """
    parser = argparse.ArgumentParser(description='AI News Summarizer')
    parser.add_argument('--topic', type=str, default="Language Models",
                        help='Topic to search for news articles (default: "Language Models")')
    parser.add_argument('--num-articles', type=int, default=50,
                        help='Number of articles to fetch (default: 50)')
    parser.add_argument('--provider', type=str, default="fireworks-ai",
                        help='Inference provider for HuggingFace (default: "fireworks-ai")')
    parser.add_argument('--repo-id', type=str, default="lvwerra/news-reports",
                        help='HuggingFace repo ID to upload the report (default: "lvwerra/news-reports")')

    # argv=None makes argparse fall back to sys.argv[1:].
    return parser.parse_args(argv)
32
+
33
def main():
    """Fetch recent news on a topic, summarize each article and upload a report.

    The topic, article count, inference provider and target repo come from the
    command line; the inference API key is read from the ``HF_API_KEY``
    environment variable. The final markdown report is uploaded to
    ``reports/<topic-slug>/<YYYY-MM-DD>.md`` in the target repo (type "space").
    Returns ``None``; exits early without uploading when no articles are found.
    """
    # Must stay byte-identical to the string fetch_full_article returns on
    # failure (including its spelling), otherwise the fallback never triggers.
    failed_fetch_marker = "Failed to fetch artcile."

    # Parse arguments
    args = parse_arguments()

    # Environment variables / configuration
    hf_api_key = os.getenv("HF_API_KEY")
    model = "Qwen/Qwen3-30B-A3B"

    # Initialize clients
    client = InferenceClient(provider=args.provider, api_key=hf_api_key)

    # Set topic and number of articles
    topic = args.topic
    num = args.num_articles

    # Configure tqdm for pandas (enables Series.progress_apply)
    tqdm.pandas(desc="")

    print(f"Fetching top {num} articles on '{topic}' of today...")
    articles = fetch_news_articles(topic, num)
    if not articles:
        # An empty result yields a DataFrame without a "url" column, so the
        # steps below would fail with a KeyError; stop with a clear message.
        print(f"No articles found for '{topic}'. Nothing to report.")
        return
    df = pd.DataFrame.from_records(articles)

    print(f"Downloading and parsing {len(df)} articles...")
    df["content_full"] = df["url"].progress_apply(fetch_full_article)
    # regex=False: the marker is a literal string, not a pattern.
    mask = df['content_full'].str.contains(failed_fetch_marker, regex=False)
    # Fall back to the snippet delivered by the news API when download failed.
    df.loc[mask, 'content_full'] = df.loc[mask, 'content']

    print(f"Summarizing each article (total={len(df)})...")
    df["summary_raw"] = df["content_full"].progress_apply(lambda x: summarize(x, client, model))
    df["summary_clean"] = df["summary_raw"].apply(_strip_thinking)

    print("Create report...")
    df["article_summary"] = df.apply(format_summary, axis=1)

    sep = "\n" + "="*80 + "\n"
    overview = sep.join([f"Article: {i+1}\n{article}" for i, article in enumerate(df["article_summary"])])
    report = create_report(overview, client, model)

    # Extract report content
    final_report = _strip_thinking(report)

    file_path = f"reports/{'-'.join(topic.lower().split())}/{date.today().strftime('%Y-%m-%d')}.md"
    # The original referenced an undefined name `filepath` here (NameError).
    print(f"Uploading to {args.repo_id} under {file_path}...")
    # Upload to HuggingFace
    hf_api = HfApi()
    hf_api.upload_file(
        path_or_fileobj=final_report.encode("utf-8"),
        path_in_repo=file_path,
        repo_id=args.repo_id,
        repo_type="space",
    )

    print("Job finished!")


def _strip_thinking(text):
    """Return the text following the first ``</think>`` marker, stripped.

    When the marker is absent the whole text is returned stripped.
    """
    return text.split("</think>")[1].strip() if "</think>" in text else text.strip()
88
+
89
def fetch_news_articles(topic, num_articles=10, timeout=30):
    """Fetch news articles on the given topic from the newsapi.org endpoint.

    Queries English articles from yesterday through today, sorted by
    popularity. The API key is read from the ``NEWS_API_KEY`` environment
    variable.

    Args:
        topic: Search query string.
        num_articles: Value sent as the ``pageSize`` request parameter.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The list under the response's ``articles`` key, or an empty list when
        the request fails, returns a non-200 status, or cannot be decoded.
    """
    news_api_key = os.getenv("NEWS_API_KEY")
    news_endpoint = 'https://newsapi.org/v2/everything'

    # Read the clock once so both bounds refer to the same day.
    today = date.today()
    yesterday = today - timedelta(days=1)

    params = {
        'q': topic,
        'from': yesterday.strftime('%Y-%m-%d'),
        'to': today.strftime('%Y-%m-%d'),
        'sortBy': 'popularity',
        'language': 'en',
        'pageSize': num_articles,
        'apiKey': news_api_key,
    }

    try:
        # Without a timeout requests may block indefinitely on a stalled server.
        response = requests.get(news_endpoint, params=params, timeout=timeout)
    except requests.RequestException as err:
        print(f"Error: request to news API failed: {err}")
        return []

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.text)
        return []

    try:
        data = response.json()
    except ValueError as err:
        print(f"Error: could not decode news API response: {err}")
        return []

    return data.get('articles', [])
116
+
117
def fetch_full_article(url):
    """Fetch and parse the full content of an article.

    Args:
        url: Address of the article to download.

    Returns:
        The parsed article text, or the literal string
        ``"Failed to fetch artcile."`` when downloading or parsing raises.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.text
    except Exception:
        # Best-effort by design: any download/parse error maps to the marker
        # string. `except Exception` (instead of a bare except) lets
        # KeyboardInterrupt/SystemExit propagate so a long run can be aborted.
        # NOTE: the caller matches this exact string (misspelling included),
        # so it must not be changed here alone.
        return "Failed to fetch artcile."
126
+
127
def summarize(article, client, model):
    """Ask the chat model for a short bullet-point summary of one article.

    Args:
        article: Article text that is interpolated into the prompt.
        client: Object exposing ``chat_completion(model=..., messages=...,
            temperature=..., max_tokens=...)``.
        model: Model identifier forwarded to the client.

    Returns:
        The content of the first choice's message in the client response.
    """
    # The trailing "/no_think" line is part of the prompt sent to the model.
    prompt = (
        "Summarize the following news article in a few bullet points. "
        "Note that the reader is an expert in the field and wants only the most relevant and novel information.\n"
        "\n"
        "Article:\n"
        f"{article}\n"
        "\n"
        "/no_think"
    )

    completion = client.chat_completion(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.8,
        max_tokens=512,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
153
+
154
def format_summary(row):
    """Render one article row as a plain-text block.

    Args:
        row: Mapping-like object providing the keys ``title``,
            ``publishedAt``, ``description``, ``url`` and ``summary_clean``.

    Returns:
        A newline-joined string with one labelled line per field, the summary
        placed on the lines after the ``Summary:`` label.
    """
    lines = [
        f"Title: {row['title']}",
        f"Published: {row['publishedAt']}",
        f"Description: {row['description']}",
        f"URL: {row['url']}",
        "Summary:",
        f"{row['summary_clean']}",
    ]
    return "\n".join(lines)
163
+
164
def create_report(articles_overview, client, model):
    """Create a categorized markdown report from all article summaries.

    Args:
        articles_overview: Text containing every formatted article summary.
        client: Object exposing ``chat_completion(model=..., messages=...,
            temperature=..., max_tokens=...)``.
        model: Model identifier forwarded to the client.

    Returns:
        The content of the first choice's message in the client response.
    """
    # Fixes relative to the original prompt: the "Separete" typo, and a stray
    # line continuation after "Articles:" that glued the heading directly onto
    # the first article ("Articles:Article: 1").
    user_msg = f"""\
Create a summary report of the following newspaper articles.

Separate the report into these categories:
- Breaking news: anything that can also appear below but is the most important news of the day
- Model news (e.g. new model releases, or insights about existing models etc.)
- Startups (e.g. new startups, fundraising etc.)
- Big Tech news (e.g. news from Google/Meta/OpenAI etc.)
- Policy (e.g. US administration or EU policy)
- Products (e.g. news of products that are powered by AI in some way)
- Miscellaneous (whatever doesn't fit into the others)

Style: The reader is an expert in the field and wants only the most relevant and novel information. \
Omit articles that are irrelevant to the field of AI and feel free to aggregate several articles about the same topic into one point. \
Start the report with a summary of how many articles you processed and which time window.

Format: Use markdown formatting and add links at the end of each section linking to the original articles.

Articles:
{articles_overview}
"""

    messages = [
        {
            "role": "user",
            "content": user_msg,
        }
    ]

    response = client.chat_completion(
        model=model,
        messages=messages,
        temperature=0.8,
        max_tokens=32000,
    )

    return response.choices[0].message.content
203
+
204
# Script entry point: run the pipeline only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()