File size: 1,524 Bytes
ea1ba01
35325e7
ea1ba01
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9fd98bf
ea1ba01
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import os

VECTOR_STORE_PATH = "vector_store_data"

def save_vector_store(vector_store):
    """Persist the given vector store to local disk.

    Writes the FAISS index under VECTOR_STORE_PATH so a later
    load_vector_store() call can restore it without re-embedding.
    """
    destination = VECTOR_STORE_PATH
    vector_store.save_local(destination)
    print(f"Vector store saved to {destination}")

def load_vector_store():
    """Load the vector store from disk, or return None if it does not exist."""
    # Guard clause: nothing on disk means nothing to load.
    if not os.path.exists(VECTOR_STORE_PATH):
        print("Vector store file not found.")
        return None

    embeddings = HuggingFaceEmbeddings(
        model_name="LazarusNLP/all-indo-e5-small-v4",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )
    # allow_dangerous_deserialization is required because FAISS persistence
    # uses pickle; only safe because we load our own locally written index.
    store = FAISS.load_local(
        VECTOR_STORE_PATH,
        embeddings,
        allow_dangerous_deserialization=True,
    )
    print(f"Vector store loaded from {VECTOR_STORE_PATH}")
    return store


def process_documents(docs, chunk_size=1500, chunk_overlap=300):
    """Split documents into overlapping chunks and index them in FAISS.

    Args:
        docs: LangChain Document objects to embed and index.
        chunk_size: Maximum characters per chunk (default 1500,
            the previously hard-coded value).
        chunk_overlap: Characters shared between consecutive chunks
            (default 300, the previously hard-coded value).

    Returns:
        A FAISS vector store built from the embedded chunks.
    """
    # Must match the embedding configuration in load_vector_store(),
    # otherwise a saved index cannot be reloaded with compatible vectors.
    embeddings = HuggingFaceEmbeddings(
        model_name="LazarusNLP/all-indo-e5-small-v4",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    text_chunks = text_splitter.split_documents(docs)
    return FAISS.from_documents(text_chunks, embeddings)