Spaces:
Sleeping
Sleeping
# src/vectorstores/chroma_vectorstore.py
import chromadb | |
from typing import List, Callable, Any | |
from .base_vectorstore import BaseVectorStore | |
class ChromaVectorStore(BaseVectorStore):
    """Vector store backed by a persistent Chroma collection.

    Document texts are stored together with their embeddings in a single
    Chroma collection and are retrieved by querying with an embedding.
    """

    def __init__(
        self,
        embedding_function: Callable[[List[str]], List[List[float]]],
        persist_directory: str = './chroma_db',
        collection_name: str = "documents",
    ):
        """
        Initialize Chroma Vector Store

        Args:
            embedding_function (Callable): Function to generate embeddings;
                called with a list of texts when ``add_documents`` is not
                given pre-computed embeddings.
            persist_directory (str): Directory to persist the vector store
            collection_name (str): Name of the collection to open, created
                if it does not exist yet. Defaults to "documents".
        """
        self.client = chromadb.PersistentClient(path=persist_directory)
        self.collection = self.client.get_or_create_collection(
            name=collection_name
        )
        self.embedding_function = embedding_function

    def add_documents(
        self,
        documents: List[str],
        embeddings: List[List[float]] = None
    ) -> None:
        """
        Add documents to the vector store

        Args:
            documents (List[str]): List of document texts
            embeddings (List[List[float]], optional): Pre-computed embeddings,
                one per document. When None or empty they are computed with
                the embedding function given at construction.
        """
        # Imported locally so this block does not rely on a file-level import.
        import uuid

        # Nothing to store: skip the embedding call and the store round-trip.
        if not documents:
            return

        # Explicit None/length check instead of truthiness, so array-like
        # embeddings (whose truth value is ambiguous) are accepted as well.
        if embeddings is None or len(embeddings) == 0:
            embeddings = self.embedding_function(documents)

        # Positional IDs ("doc_0", "doc_1", ...) repeat on every call and on
        # every reopen of a persisted collection, so later batches would clash
        # with records already stored. Random UUIDs keep each record distinct.
        ids = [f"doc_{uuid.uuid4().hex}" for _ in documents]

        self.collection.add(
            documents=documents,
            embeddings=embeddings,
            ids=ids,
        )

    def similarity_search(
        self,
        query_embedding: List[float],
        top_k: int = 3
    ) -> List[str]:
        """
        Perform similarity search

        Args:
            query_embedding (List[float]): Embedding of the query
            top_k (int): Number of top similar documents to retrieve

        Returns:
            List[str]: List of most similar documents; empty when the query
                result carries no documents.
        """
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k,
        )
        # The result holds one list of documents per query embedding; a single
        # query was sent, so only the first list is relevant. Fall back to an
        # empty result when the entry is missing, None or empty.
        per_query_documents = results.get('documents') or [[]]
        return per_query_documents[0]