'''Tool functions for MCP server'''

import os
import threading
import time
import json
import logging
import queue
from upstash_vector import Index

import functions.feed_extraction as extraction_funcs
import functions.summarization as summarization_funcs
import functions.rag as rag_funcs

# Queue of full-text feed items waiting to be embedded into the vector store
RAG_INGEST_QUEUE = queue.Queue()

# Run the RAG ingest worker in a background daemon thread so queued items are
# processed without blocking tool calls; daemon=True lets the process exit
# even if the worker is still blocked waiting on the queue
rag_ingest_thread = threading.Thread(
    target=rag_funcs.ingest,
    args=(RAG_INGEST_QUEUE,),
    daemon=True
)

rag_ingest_thread.start()
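# The worker target is assumed to follow a standard queue-consumer loop
# along these lines (a sketch only; the real implementation lives in
# functions/rag.py):
#
#   def ingest(ingest_queue: queue.Queue) -> None:
#       while True:
#           item = ingest_queue.get()  # blocks until a feed item arrives
#           # chunk item['content'], embed it, and upsert it into the
#           # vector store (e.g. under the article title as namespace)
#           ingest_queue.task_done()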


def get_feed(website: str) -> str:
    '''Gets RSS feed content from a given website. Can take a website or RSS
    feed URL directly, or the name of a website. Will attempt to find the
    RSS feed and return the title, summary and link to the full article for
    the most recent items in the feed.

    Args:
        website: URL or name of website to extract RSS feed content from

    Returns:
        JSON string containing the feed content, or 'No feed found' if an
        RSS feed for the requested website could not be found
    '''

    start_time = time.time()

    logger = logging.getLogger(__name__ + '.get_feed()')
    logger.info('Getting feed content for: %s', website)

    # Find the feed's URI from the website name/URL
    feed_uri = extraction_funcs.find_feed_uri(website)
    logger.info('find_feed_uri() returned %s', feed_uri)

    if 'No feed found' in feed_uri:
        logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
        return 'No feed found'

    # Parse and extract content from the feed
    content = extraction_funcs.parse_feed(feed_uri)
    logger.info('parse_feed() returned %s entries', len(content))

    # Summarize each post in the feed and submit full text for RAG ingest
    for i, item in content.items():

        if item['content'] is not None:

            RAG_INGEST_QUEUE.put(item)
            logger.info('"%s" sent to RAG ingest', item['title'])

            summary = summarization_funcs.summarize_content(
                item['title'],
                item['content']
            )

            content[i]['summary'] = summary
            logger.info('Summary of "%s" generated', item['title'])

        content[i].pop('content', None)

    logger.info('Completed in %s seconds', round(time.time()-start_time, 2))

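    # The serialized feed is assumed to look roughly like this (a sketch;
    # exact keys depend on what parse_feed() extracts):
    #
    #   {
    #     "0": {"title": "...", "link": "https://...", "summary": "..."},
    #     "1": {"title": "...", "link": "https://..."}
    #   }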
    return json.dumps(content)


def context_search(query: str, article_title: str = None) -> str:
    '''Searches for context relevant to query in the article vector store.

    Args:
        query: user query to find context for
        article_title: optional, use this argument to search only for context
            from a specific article

    Returns:
        Context which best matches the query, as a string.
    '''

    # Connect to the Upstash vector index; the REST token is read from the
    # UPSTASH_VECTOR_KEY environment variable
    index = Index(
        url='https://living-whale-89944-us1-vector.upstash.io',
        token=os.environ['UPSTASH_VECTOR_KEY']
    )

    # Embed the query server-side and retrieve the three closest chunks;
    # passing a namespace restricts the search to a single article
    results = index.query(
        data=query,
        top_k=3,
        include_metadata=True,
        include_data=True,
        namespace=article_title if article_title else ''
    )

    # Concatenate the matched chunk texts into a single context string
    return '\n\n'.join(result.data for result in results if result.data)
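

# Minimal smoke test (a sketch; assumes UPSTASH_VECTOR_KEY is set and that
# the example website name below is one find_feed_uri() can resolve)
if __name__ == '__main__':

    logging.basicConfig(level=logging.INFO)

    # Fetch, summarize and queue the most recent posts from a feed
    print(get_feed('hackernews.com'))

    # Retrieve the best-matching context for a query
    print(context_search('What is happening in AI this week?'))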