# pnp-chatbot-v1 / app/document_processor.py
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import os
# Directory where the FAISS index is persisted between runs.
VECTOR_STORE_PATH = "vector_store_data"

def save_vector_store(vector_store):
    """Persist the given FAISS vector store to the local store directory."""
    vector_store.save_local(VECTOR_STORE_PATH)
    print("Vector store saved to " + VECTOR_STORE_PATH)
def load_vector_store():
    """Load the persisted vector store, or return None when it does not exist.

    NOTE(review): ``allow_dangerous_deserialization=True`` unpickles the
    stored index — only safe because the file is produced locally by
    ``save_vector_store``; never point this at untrusted data.
    """
    # Guard clause: nothing on disk means nothing to load.
    if not os.path.exists(VECTOR_STORE_PATH):
        print("Vector store file not found.")
        return None

    # Embedding config must match the one used when the store was built.
    embeddings = HuggingFaceEmbeddings(
        model_name="LazarusNLP/all-indo-e5-small-v4",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )
    store = FAISS.load_local(
        VECTOR_STORE_PATH, embeddings, allow_dangerous_deserialization=True
    )
    print(f"Vector store loaded from {VECTOR_STORE_PATH}")
    return store
def process_documents(docs):
    """Split documents into chunks and index them in a new FAISS vector store.

    Args:
        docs: list of LangChain ``Document`` objects (may be empty or None).

    Returns:
        A FAISS vector store built from the chunked documents, or ``None``
        when ``docs`` is empty — consistent with ``load_vector_store``
        returning ``None`` when there is nothing to load.
    """
    # Guard: FAISS.from_documents raises on an empty document list, so
    # bail out early instead of crashing on empty input.
    if not docs:
        print("No documents to process.")
        return None
    embeddings = HuggingFaceEmbeddings(
        model_name="LazarusNLP/all-indo-e5-small-v4",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True}
    )
    # Chunk size/overlap chosen for the e5-small embedding model; the
    # overlap preserves sentence continuity across chunk boundaries.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=300
    )
    text_chunks = text_splitter.split_documents(docs)
    vector_store = FAISS.from_documents(text_chunks, embeddings)
    return vector_store