Spaces:

Agents-MCP-Hackathon
/

rss-mcp-server

Running

App Files Files Community

gperdrizet committed on 2 days ago

Commit

4c58071

unverified ·

1 Parent(s): 4a36ecf

Finished feed content summarization.

Browse files

Files changed (4) hide show

functions/feed_extraction.py +15 -11
functions/summarization.py +74 -0
functions/tools.py +9 -0
requirements.txt +0 -1

functions/feed_extraction.py CHANGED Viewed

@@ -92,28 +92,29 @@ def parse_feed(feed_uri: str) -> list:
             entry_content['title'] = entry.title
             entry_content['link'] = entry.link
-            entry_content['updated'] = None
-            entry_content['summary'] = None
             entry_content['content'] = None
-            if 'updated' in entry:
-                entry_content['updated'] = entry.updated
-            if 'summary' in entry:
-                summary = _get_text(entry.summary)
-                entry_content['summary'] = summary
             if 'content' in entry:
                 entry_content['content'] = entry.content
-            html = _get_html(entry_content['link'])
-            content = _get_text(html)
-            entry_content['extracted_content'] = content
         entries[i] = entry_content
-        if i == 9:
             break
     logger.info('Entries contains %s elements', len(list(entries.keys())))
@@ -262,6 +263,9 @@ def _clean_html(html: str) -> str:
         Cleaned string
     '''
     # First we remove inline JavaScript/CSS:
     cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())

             entry_content['title'] = entry.title
             entry_content['link'] = entry.link
+            # entry_content['updated'] = None
+            # entry_content['summary'] = None
             entry_content['content'] = None
+            # if 'updated' in entry:
+            #     entry_content['updated'] = entry.updated
+            # if 'summary' in entry:
+            #     summary = _get_text(entry.summary)
+            #     entry_content['summary'] = summary
             if 'content' in entry:
                 entry_content['content'] = entry.content
+            if entry_content['content'] is None:
+                html = _get_html(entry_content['link'])
+                content = _get_text(html)
+                entry_content['content'] = content
         entries[i] = entry_content
+        if i == 2:
             break
     logger.info('Entries contains %s elements', len(list(entries.keys())))
         Cleaned string
     '''
+    if html is None:
+        return None
     # First we remove inline JavaScript/CSS:
     cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())

functions/summarization.py ADDED Viewed

	@@ -0,0 +1,74 @@

+'''Functions to summarize article content.'''
+import os
+import logging
+from openai import OpenAI
+def summarize_content(content: str) -> str:
+    '''Generates summary of article content using Modal inference endpoint.
+    Args:
+        content: string containing the text content to be summarized
+    Returns:
+        Summarized text as string
+    '''
+    logger = logging.getLogger(__name__ + '.summarize_content')
+    logger.info('Summarizing extracted content')
+    client = OpenAI(api_key=os.environ['MODAL_API_KEY'])
+    client.base_url = (
+        'https://gperdrizet--vllm-openai-compatible-summarization-serve.modal.run/v1'
+    )
+    # Default to first avalible model
+    model = client.models.list().data[0]
+    model_id = model.id
+    # messages = [
+    #     {
+    #         'role': 'system',
+    #         'content': ('You are a research assistant, skilled in summarizing documents in just '+
+    #             'a few sentences. Your document summaries should be a maximum of 2 to 4 sentences long.'),
+    #         'role': 'user',
+    #         'content': content
+    #     }
+    # ]
+    messages = [
+        {
+            'role': 'system',
+            'content': f'Summarize the following text in 50 words returning only the summary: {content}'
+        }
+    ]
+    completion_args = {
+        'model': model_id,
+        'messages': messages,
+        # "frequency_penalty": args.frequency_penalty,
+        # "max_tokens": 128,
+        # "n": args.n,
+        # "presence_penalty": args.presence_penalty,
+        # "seed": args.seed,
+        # "stop": args.stop,
+        # "stream": args.stream,
+        # "temperature": args.temperature,
+        # "top_p": args.top_p,
+    }
+    try:
+        response = client.chat.completions.create(**completion_args)
+    except Exception as e: # pylint: disable=broad-exception-caught
+        response = None
+        logger.error('Error during Modal API call: %s', e)
+    if response is not None:
+        return response.choices[0].message.content
+    else:
+        return None

functions/tools.py CHANGED Viewed

@@ -3,6 +3,7 @@
 import json
 import logging
 import functions.feed_extraction as extraction_funcs
 def get_feed(website: str) -> list:
@@ -31,4 +32,12 @@ def get_feed(website: str) -> list:
     content = extraction_funcs.parse_feed(feed_uri)
     logger.info('parse_feed() returned %s entries', len(list(content.keys())))
     return json.dumps(content)

 import json
 import logging
 import functions.feed_extraction as extraction_funcs
+import functions.summarization as summarization_funcs
 def get_feed(website: str) -> list:
     content = extraction_funcs.parse_feed(feed_uri)
     logger.info('parse_feed() returned %s entries', len(list(content.keys())))
+    for i, item in content.items():
+        if item['content'] is not None:
+            summary = summarization_funcs.summarize_content(item['content'])
+            content[i]['summary'] = summary
+        content[i].pop('content', None)
     return json.dumps(content)

requirements.txt CHANGED Viewed

@@ -4,5 +4,4 @@ findfeed
 googlesearch-python
 gradio
 mcp
-#modal
 openai

 googlesearch-python
 gradio
 mcp
 openai