Spaces:
Sleeping
Sleeping
import faiss | |
import json | |
import numpy as np | |
from pathlib import Path | |
from src.utils.config import VECTOR_DB_PATH, EMBEDDING_DIM | |
from typing import List | |
class VectorStore: | |
""" | |
Wrapper for FAISS vector storage, with ID-to-text mapping. | |
""" | |
def __init__(self, index_path: Path = VECTOR_DB_PATH): | |
self.index_path = index_path.with_suffix(".index") | |
self.meta_path = index_path.with_suffix(".json") | |
self.index = faiss.IndexFlatL2(EMBEDDING_DIM) | |
self.metadata = [] # list of dicts: {"id": str, "text": str} | |
# Try loading if exists | |
if self.index_path.exists() and self.meta_path.exists(): | |
try: | |
self.load() | |
except Exception as e: | |
print(f"[WARN] Failed to load vector store: {e}") | |
# Reinitialize clean if corrupted | |
self.index = faiss.IndexFlatL2(EMBEDDING_DIM) | |
self.metadata = [] | |
def add(self, embeddings: list[list[float]], metadata: List[dict]): | |
""" | |
Add new embeddings and their metadata (e.g., {"id": "doc1_chunk0", "text": "..."}) | |
""" | |
self.index.add(np.array(embeddings).astype("float32")) | |
self.metadata.extend(metadata) | |
self.save() | |
def search(self, query_embedding: list[float], top_k: int = 5) -> List[dict]: | |
""" | |
Perform vector search and return metadata of top_k results. | |
""" | |
D, I = self.index.search(np.array([query_embedding]).astype("float32"), top_k) | |
return [self.metadata[i] for i in I[0]] | |
def save(self) -> None: | |
""" | |
Save data to an external file. | |
""" | |
self.index_path.parent.mkdir(parents = True, exist_ok = True) | |
faiss.write_index(self.index, str(self.index_path)) | |
with open(self.meta_path, 'w', encoding = "utf-8") as f: | |
json.dump(self.metadata, f, ensure_ascii = False, indent = 2) | |
def load(self) -> None: | |
""" | |
Load data from an external file. | |
""" | |
self.index = faiss.read_index(str(self.index_path)) | |
with open(self.meta_path, 'r', encoding = "utf-8") as f: | |
self.metadata = json.load(f) | |