gperdrizet committed (unverified)
Commit 4c58071 · Parent: 4a36ecf

Finished feed content summarization.

functions/feed_extraction.py CHANGED
@@ -92,28 +92,29 @@ def parse_feed(feed_uri: str) -> list:
         entry_content['title'] = entry.title
         entry_content['link'] = entry.link
 
-        entry_content['updated'] = None
-        entry_content['summary'] = None
+        # entry_content['updated'] = None
+        # entry_content['summary'] = None
         entry_content['content'] = None
 
-        if 'updated' in entry:
-            entry_content['updated'] = entry.updated
+        # if 'updated' in entry:
+        #     entry_content['updated'] = entry.updated
 
-        if 'summary' in entry:
-            summary = _get_text(entry.summary)
-            entry_content['summary'] = summary
+        # if 'summary' in entry:
+        #     summary = _get_text(entry.summary)
+        #     entry_content['summary'] = summary
 
         if 'content' in entry:
             entry_content['content'] = entry.content
 
-        html = _get_html(entry_content['link'])
-        content = _get_text(html)
+        if entry_content['content'] is None:
 
-        entry_content['extracted_content'] = content
+            html = _get_html(entry_content['link'])
+            content = _get_text(html)
+            entry_content['content'] = content
 
         entries[i] = entry_content
 
-        if i == 9:
+        if i == 2:
             break
 
     logger.info('Entries contains %s elements', len(list(entries.keys())))
@@ -262,6 +263,9 @@ def _clean_html(html: str) -> str:
         Cleaned string
     '''
 
+    if html is None:
+        return None
+
     # First we remove inline JavaScript/CSS:
     cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
 
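With this change, full-page scraping via _get_html()/_get_text() only runs when a feed entry carries no inline content, the result is stored under 'content' (the separate 'extracted_content' key is gone), and _clean_html() now tolerates a None page. For orientation, a parse_feed() result should now look roughly like the sketch below; the values are placeholders, and only the first three entries are kept because of the temporary `if i == 2: break`.

# Illustrative shape of a parse_feed() result after this commit (placeholder values only).
entries = {
    0: {
        'title': 'Example article title',
        'link': 'https://example.com/article',
        # Inline feed content when the publisher supplies it, otherwise text
        # extracted from the linked page, or None if neither was available.
        'content': 'Article text used later for summarization...',
    },
    # Keys 1 and 2 follow the same shape; 'updated' and 'summary' are no
    # longer populated by parse_feed().
}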
 
functions/summarization.py ADDED
@@ -0,0 +1,74 @@
+'''Functions to summarize article content.'''
+
+import os
+import logging
+
+from openai import OpenAI
+
+
+def summarize_content(content: str) -> str:
+    '''Generates a summary of article content using the Modal inference endpoint.
+
+    Args:
+        content: string containing the text content to be summarized
+
+    Returns:
+        Summarized text as string
+    '''
+
+    logger = logging.getLogger(__name__ + '.summarize_content')
+    logger.info('Summarizing extracted content')
+
+    client = OpenAI(api_key=os.environ['MODAL_API_KEY'])
+
+    client.base_url = (
+        'https://gperdrizet--vllm-openai-compatible-summarization-serve.modal.run/v1'
+    )
+
+    # Default to first available model
+    model = client.models.list().data[0]
+    model_id = model.id
+
+    # messages = [
+    #     {
+    #         'role': 'system',
+    #         'content': ('You are a research assistant, skilled in summarizing documents in just '+
+    #             'a few sentences. Your document summaries should be a maximum of 2 to 4 sentences long.'),
+    #         'role': 'user',
+    #         'content': content
+    #     }
+    # ]
+
+    messages = [
+        {
+            'role': 'system',
+            'content': f'Summarize the following text in 50 words returning only the summary: {content}'
+        }
+    ]
+
+    completion_args = {
+        'model': model_id,
+        'messages': messages,
+        # "frequency_penalty": args.frequency_penalty,
+        # "max_tokens": 128,
+        # "n": args.n,
+        # "presence_penalty": args.presence_penalty,
+        # "seed": args.seed,
+        # "stop": args.stop,
+        # "stream": args.stream,
+        # "temperature": args.temperature,
+        # "top_p": args.top_p,
+    }
+
+    try:
+        response = client.chat.completions.create(**completion_args)
+
+    except Exception as e:  # pylint: disable=broad-exception-caught
+        response = None
+        logger.error('Error during Modal API call: %s', e)
+
+    if response is not None:
+        return response.choices[0].message.content
+
+    else:
+        return None
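A minimal usage sketch for the new helper (hypothetical values: it assumes MODAL_API_KEY is set in the environment and that the Modal endpoint hard-coded above is reachable):

# Hypothetical example call; the article text here is a placeholder.
import functions.summarization as summarization_funcs

summary = summarization_funcs.summarize_content('Full text of the article to condense...')

if summary is None:
    # summarize_content() logs the API error and returns None on failure.
    print('Summarization failed, see the log for details')
else:
    print(summary)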
functions/tools.py CHANGED
@@ -3,6 +3,7 @@
 import json
 import logging
 import functions.feed_extraction as extraction_funcs
+import functions.summarization as summarization_funcs
 
 
 def get_feed(website: str) -> list:
@@ -31,4 +32,12 @@ def get_feed(website: str) -> list:
     content = extraction_funcs.parse_feed(feed_uri)
     logger.info('parse_feed() returned %s entries', len(list(content.keys())))
 
+    for i, item in content.items():
+
+        if item['content'] is not None:
+            summary = summarization_funcs.summarize_content(item['content'])
+            content[i]['summary'] = summary
+
+        content[i].pop('content', None)
+
     return json.dumps(content)
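After this change, get_feed() returns JSON in which each entry's raw 'content' has been dropped and, where extraction succeeded, replaced by a short 'summary'. A sketch of how a caller might consume it (the site name is a placeholder):

# Hypothetical caller; dict keys come back as strings after the JSON round trip.
import json
import functions.tools as tools_funcs

feed = json.loads(tools_funcs.get_feed('example.com'))

for key, entry in feed.items():
    # 'summary' is only present when the entry had content to summarize.
    print(key, entry['title'], entry.get('summary'))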
requirements.txt CHANGED
@@ -4,5 +4,4 @@ findfeed
 googlesearch-python
 gradio
 mcp
-#modal
 openai