Spaces:
Sleeping
Sleeping
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_huggingface import HuggingFaceEmbeddings | |
from langchain_community.vectorstores import FAISS | |
import os | |
# Filesystem path that the vector store is saved to and loaded back from.
VECTOR_STORE_PATH = "vector_store_data"
def save_vector_store(vector_store):
    """Persist the given vector store at ``VECTOR_STORE_PATH``.

    Calls ``vector_store.save_local`` with the module-level path and then
    prints a confirmation message naming that path.

    Args:
        vector_store: Object exposing a ``save_local(path)`` method.
    """
    destination = VECTOR_STORE_PATH
    vector_store.save_local(destination)
    print(f"Vector store saved to {VECTOR_STORE_PATH}")
def load_vector_store():
    """Load the vector store from ``VECTOR_STORE_PATH`` if it exists.

    Returns:
        The object returned by ``FAISS.load_local``, or ``None`` when
        nothing exists at ``VECTOR_STORE_PATH``.
    """
    # Guard clause: with nothing on disk there is nothing to load.
    if not os.path.exists(VECTOR_STORE_PATH):
        print("Vector store file not found.")
        return None

    # Same embedding configuration that process_documents uses when it
    # builds the store.
    embedding_model = HuggingFaceEmbeddings(
        model_name="LazarusNLP/all-indo-e5-small-v4",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )

    # NOTE(review): this flag opts in to what the API itself labels
    # "dangerous deserialization"; the data at VECTOR_STORE_PATH should be
    # trusted -- confirm it is only ever written by save_vector_store.
    store = FAISS.load_local(
        VECTOR_STORE_PATH,
        embedding_model,
        allow_dangerous_deserialization=True,
    )
    print(f"Vector store loaded from {VECTOR_STORE_PATH}")
    return store
def process_documents(docs, *, chunk_size=1500, chunk_overlap=300):
    """Split documents into overlapping chunks and index them with FAISS.

    Args:
        docs: Documents handed to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` passed to the text splitter. Defaults
            to 1500, the value that was previously hard-coded.
        chunk_overlap: ``chunk_overlap`` passed to the text splitter.
            Defaults to 300, the value that was previously hard-coded.

    Returns:
        The vector store built by ``FAISS.from_documents`` from the
        resulting chunks.
    """
    embeddings = HuggingFaceEmbeddings(
        model_name="LazarusNLP/all-indo-e5-small-v4",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    text_chunks = text_splitter.split_documents(docs)
    return FAISS.from_documents(text_chunks, embeddings)