Spaces:
Runtime error
Runtime error
'''Collection of functions for RAG on article texts.'''
import os | |
import logging | |
import queue | |
from semantic_text_splitter import TextSplitter | |
from tokenizers import Tokenizer | |
from upstash_vector import Index | |
def ingest(rag_ingest_queue: queue.Queue) -> None:
    '''Semantically chunks articles and upserts them to the Upstash vector db
    using the article title as namespace.

    Runs forever as a queue worker: blocks on the queue, and for every
    article whose title is not yet a namespace in the index, splits the
    text into chunks of at most 256 tokens and upserts them.

    Args:
        rag_ingest_queue: queue of dict items; each item is read via its
            'title' and 'content' keys.

    Raises:
        KeyError: if the UPSTASH_VECTOR_KEY environment variable is unset
            or a queue item lacks 'title' / 'content'.
    '''

    logger = logging.getLogger(__name__ + '.ingest()')

    index = Index(
        url='https://living-whale-89944-us1-vector.upstash.io',
        token=os.environ['UPSTASH_VECTOR_KEY']
    )

    # The tokenizer and splitter do not depend on the article, so build
    # them once instead of reloading the tokenizer for every queue item.
    tokenizer = Tokenizer.from_pretrained('bert-base-uncased')
    splitter = TextSplitter.from_huggingface_tokenizer(tokenizer, 256)

    # Upper bound on vectors sent per upsert request, to keep request
    # payloads small for long articles.
    batch_size = 100

    while True:
        item = rag_ingest_queue.get()
        title = item['title']
        logger.info('Upserting "%s": %s', title, item)

        # List namespaces only after an item arrives: get() may block for
        # a long time, and a list fetched beforehand could be stale.
        namespaces = index.list_namespaces()

        if title in namespaces:
            logger.info('%s already in RAG namespace', title)
            continue

        logger.info('Got "%s" from RAG ingest queue', title)
        chunks = splitter.chunks(item['content'])

        # Use a deterministic string ID. The built-in hash() of a str is
        # salted per process, so it would yield different IDs on each run.
        vectors = [
            (f'{title}-{i}', chunk, {'namespace': title})
            for i, chunk in enumerate(chunks)
        ]

        # Pass the title as the real namespace (not only as metadata) so
        # the list_namespaces() check above can detect ingested articles.
        for start in range(0, len(vectors), batch_size):
            index.upsert(
                vectors[start:start + batch_size],
                namespace=title
            )

        # len() rather than the loop index: safe when there are no chunks.
        logger.info('Ingested %s chunks into vector DB', len(vectors))