Spaces:

lvwerra
/

ai-news

Runtime error

App Files Files Community

lvwerra HF Staff committed on May 1

Commit

b6cc122

verified ·

1 Parent(s): 792ea9a

Create run.py

Browse files

Files changed (1) hide show

run.py +205 -0

run.py ADDED Viewed

	@@ -0,0 +1,205 @@

+#!/usr/bin/env python3
+"""
+AI News Summarizer
+A script to fetch, summarize, and create reports on recent AI news articles based on a specified topic.
+"""
+import argparse
+from huggingface_hub import HfApi, InferenceClient
+from newspaper import Article
+import pandas as pd
+import requests
+from datetime import date, timedelta
+import json
+import os
+from tqdm.auto import tqdm
def parse_arguments(argv=None):
    """Parse command line arguments.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            what the script entry point uses. Passing an explicit list makes
            the parser usable from other Python code and from tests.

    Returns:
        argparse.Namespace with the attributes ``topic``, ``num_articles``,
        ``provider`` and ``repo_id``.

    Raises:
        SystemExit: raised by argparse for invalid arguments, including a
            ``--num-articles`` value smaller than 1.
    """
    parser = argparse.ArgumentParser(description='AI News Summarizer')
    parser.add_argument('--topic', type=str, default="Language Models",
                        help='Topic to search for news articles (default: "Language Models")')
    parser.add_argument('--num-articles', type=int, default=50,
                        help='Number of articles to fetch (default: 50)')
    parser.add_argument('--provider', type=str, default="fireworks-ai",
                        help='Inference provider for HuggingFace (default: "fireworks-ai")')
    parser.add_argument('--repo-id', type=str, default="lvwerra/news-reports",
                        help='HuggingFace repo ID to upload the report (default: "lvwerra/news-reports")')
    args = parser.parse_args(argv)
    # Fail fast with a usage message instead of sending a meaningless page
    # size to the news API later on.
    if args.num_articles < 1:
        parser.error("--num-articles must be a positive integer")
    return args
def _strip_reasoning(text):
    """Return a model response without its leading ``...</think>`` section."""
    # Everything after the first closing tag is the answer; a response that
    # has no tag is returned unchanged apart from surrounding whitespace.
    return text.split("</think>", 1)[-1].strip()


def main():
    """Run the whole pipeline: fetch, summarize, build a report, upload it.

    Steps:
        1. Parse the command line arguments.
        2. Fetch article records for the topic via ``fetch_news_articles``.
        3. Download each article's full text, falling back to the record's
           ``content`` field when the download failed.
        4. Summarize every article, then combine the summaries into a single
           report with ``create_report``.
        5. Upload the report as ``reports/<topic-slug>/<YYYY-MM-DD>.md`` to
           the repo given by ``--repo-id``.

    The inference API key is read from the ``HF_API_KEY`` environment
    variable. Returns ``None``; exits early when no articles were fetched.
    """
    args = parse_arguments()

    HF_API_KEY = os.getenv("HF_API_KEY")
    MODEL = "Qwen/Qwen3-30B-A3B"
    client = InferenceClient(provider=args.provider, api_key=HF_API_KEY)

    topic = args.topic
    num = args.num_articles

    # Registers ``progress_apply`` on pandas objects.
    tqdm.pandas(desc="")

    print(f"Fetching top {num} articles on '{topic}' of today...")
    articles = fetch_news_articles(topic, num)
    if not articles:
        # An empty frame has no "url" column, so the steps below would fail
        # with a KeyError; there is also nothing to report on.
        print("No articles found, nothing to report.")
        return
    df = pd.DataFrame.from_records(articles)

    print(f"Downloading and parsing {len(df)} articles...")
    df["content_full"] = df["url"].progress_apply(fetch_full_article)
    # fetch_full_article returns exactly this (misspelled) sentinel on
    # failure. Compare for equality rather than using a regex substring
    # search, so article text is never misread as a failure marker.
    failed = df["content_full"] == "Failed to fetch artcile."
    df.loc[failed, "content_full"] = df.loc[failed, "content"]

    print(f"Summarizing each article (total={len(df)})...")
    df["summary_raw"] = df["content_full"].progress_apply(lambda x: summarize(x, client, MODEL))
    df["summary_clean"] = df["summary_raw"].apply(_strip_reasoning)

    print("Create report...")
    df["article_summary"] = df.apply(format_summary, axis=1)
    sep = "\n" + "=" * 80 + "\n"
    overview = sep.join(
        f"Article: {i}\n{article}"
        for i, article in enumerate(df["article_summary"], start=1)
    )
    report = create_report(overview, client, MODEL)
    final_report = _strip_reasoning(report)

    # Topic words are lower-cased and joined with "-" to form the folder name.
    file_path = f"reports/{'-'.join(topic.lower().split())}/{date.today().strftime('%Y-%m-%d')}.md"
    print(f"Uploading to {args.repo_id} under {file_path}...")

    # NOTE(review): HfApi() is created without an explicit token and the
    # target is uploaded as repo_type="space" -- confirm both match the
    # destination repo and the credentials available at runtime.
    hf_api = HfApi()
    hf_api.upload_file(
        path_or_fileobj=final_report.encode("utf-8"),
        path_in_repo=file_path,
        repo_id=args.repo_id,
        repo_type="space",
    )
    print("Job finished!")
def fetch_news_articles(topic, num_articles=10):
    """Fetch news articles on the given topic.

    Queries the NewsAPI ``/v2/everything`` endpoint for English articles
    dated between yesterday and today, sorted by popularity. The API key is
    read from the ``NEWS_API_KEY`` environment variable.

    Args:
        topic: Search query sent as the ``q`` parameter.
        num_articles: Value sent as ``pageSize`` (default: 10).

    Returns:
        The list stored under ``articles`` in the JSON response, or an empty
        list when the request fails, returns a non-200 status, or the
        response has no ``articles`` entry. Errors are printed, not raised.
    """
    NEWS_API_KEY = os.getenv("NEWS_API_KEY")
    NEWS_ENDPOINT = 'https://newsapi.org/v2/everything'

    # Read the clock once so both dates stay consistent even if the call
    # happens right at midnight.
    today = date.today()
    yesterday = today - timedelta(days=1)
    params = {
        'q': topic,
        'from': yesterday.strftime('%Y-%m-%d'),
        'to': today.strftime('%Y-%m-%d'),
        'sortBy': 'popularity',
        'language': 'en',
        'pageSize': num_articles,
        'apiKey': NEWS_API_KEY,
    }

    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(NEWS_ENDPOINT, params=params, timeout=30)
    except requests.RequestException as exc:
        # Same best-effort contract as the non-200 branch: report and
        # return nothing.
        print(f"Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.text)
        return []

    data = response.json()
    return data.get('articles', [])
def fetch_full_article(url):
    """Fetch and parse the full content of an article.

    Args:
        url: Address of the article, passed to ``newspaper.Article``.

    Returns:
        The parsed article text, or the sentinel string
        ``"Failed to fetch artcile."`` when downloading or parsing raises.
    """
    try:
        a = Article(url)
        a.download()
        a.parse()
        return a.text
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop the run.
        # The misspelled sentinel is matched verbatim in main(); do not
        # "fix" the spelling here without changing it there too.
        return "Failed to fetch artcile."
def summarize(article, client, model):
    """Ask the chat model for a bullet-point summary of a single article.

    Args:
        article: Text of the article, interpolated into the prompt.
        client: Object exposing ``chat_completion(model=..., messages=...,
            temperature=..., max_tokens=...)``.
        model: Model identifier forwarded to ``chat_completion``.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    instructions = (
        "Summarize the following news article in a few bullet points. "
        "Note that the reader is an expert in the field and wants only "
        "the most relevant and novel information."
    )
    # "/no_think" is sent verbatim as the last prompt line -- presumably the
    # model's switch for suppressing reasoning output (verify for the model
    # in use).
    prompt = "\n".join([instructions, "Article:", f"{article}", "/no_think"])

    completion = client.chat_completion(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.8,
        max_tokens=512,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
def format_summary(row):
    """Render one article record as a plain-text block for the report prompt.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            ``title``, ``publishedAt``, ``description``, ``url`` and
            ``summary_clean``.

    Returns:
        A multi-line string with one ``Label: value`` line per metadata
        field, followed by ``Summary:`` and the summary on the next line.
    """
    labelled_fields = (
        ("Title", "title"),
        ("Published", "publishedAt"),
        ("Description", "description"),
        ("URL", "url"),
    )
    lines = [f"{label}: {row[key]}" for label, key in labelled_fields]
    # The summary starts on its own line below the "Summary:" label.
    lines.append(f"Summary:\n{row['summary_clean']}")
    return "\n".join(lines)
def create_report(articles_overview, client, model):
    """Create a comprehensive report from all article summaries.

    Args:
        articles_overview: Text containing all formatted article summaries;
            it is appended to the prompt below the ``Articles:`` label.
        client: Object exposing ``chat_completion(model=..., messages=...,
            temperature=..., max_tokens=...)``.
        model: Model identifier forwarded to ``chat_completion``.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    # A trailing backslash joins a line with the next one. It is used on
    # purpose inside the "Style:" paragraph, but NOT after "Articles:", so
    # the overview starts on its own line instead of being glued to the
    # label ("Articles:Article: 1").
    user_msg = f"""\
Create a summary report of the following newspaper articles.
Separate the report into these categories:
- Breaking news: anything that can also appear below but is the most important news of the day
- Model news (e.g. new model releases, or insights about existing models etc.)
- Startups (e.g. new startups, fundraising etc.)
- Big Tech news (e.g. news from Google/Meta/OpenAI etc.)
- Policy (e.g. US administration or EU policy)
- Products (e.g. news of products that are powered by AI in some way)
- Miscellaneous (whatever doesn't fit into the others)
Style: The reader is an expert in the field and wants only the most relevant and novel information. \
Omit articles that are irrelevant to the field of AI and feel free to aggregate several articles about the same topic into one point. \
Start the report with a summary of how many articles you processed and which time window.
Format: Use markdown formatting and add links at the end of each section linking to the original articles.
Articles:
{articles_overview}
"""
    messages = [
        {
            "role": "user",
            "content": user_msg,
        }
    ]
    response = client.chat_completion(
        model=model,
        messages=messages,
        temperature=0.8,
        max_tokens=32000,
    )
    return response.choices[0].message.content
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()