from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import os
import tempfile
import streamlit as st
def save_vector_store_to_supabase(vector_store, supabase, bucket_name, file_prefix="vector_store"):
    """Save a FAISS vector store to Supabase Storage as two separate files."""
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            # Save the vector store locally first; FAISS.save_local writes
            # index.faiss (the raw index) and index.pkl (the docstore mapping)
            local_path = os.path.join(temp_dir, "vector_store")
            vector_store.save_local(local_path)

            # Upload index.faiss
            faiss_file = os.path.join(local_path, "index.faiss")
            if os.path.exists(faiss_file):
                with open(faiss_file, "rb") as f:
                    supabase.storage.from_(bucket_name).upload(
                        f"{file_prefix}_index.faiss",
                        f,
                        {"upsert": "true"}
                    )
                print(f"Uploaded: {file_prefix}_index.faiss")

            # Upload index.pkl
            pkl_file = os.path.join(local_path, "index.pkl")
            if os.path.exists(pkl_file):
                with open(pkl_file, "rb") as f:
                    supabase.storage.from_(bucket_name).upload(
                        f"{file_prefix}_index.pkl",
                        f,
                        {"upsert": "true"}
                    )
                print(f"Uploaded: {file_prefix}_index.pkl")

            print(f"Vector store uploaded to Supabase bucket: {bucket_name}")
            return True
    except Exception as e:
        print(f"Error uploading vector store to Supabase: {e}")
        st.error(f"Error uploading to Supabase: {e}")
        return False
def load_vector_store_from_supabase(supabase, bucket_name, file_prefix="vector_store"):
    """Load a FAISS vector store from its two files in Supabase Storage."""
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            local_path = os.path.join(temp_dir, "vector_store")
            os.makedirs(local_path, exist_ok=True)

            # Download index.faiss
            try:
                faiss_response = supabase.storage.from_(bucket_name).download(f"{file_prefix}_index.faiss")
                faiss_file = os.path.join(local_path, "index.faiss")
                with open(faiss_file, "wb") as f:
                    f.write(faiss_response)
                print(f"Downloaded: {file_prefix}_index.faiss")
            except Exception as e:
                print(f"Error downloading index.faiss: {e}")
                return None

            # Download index.pkl
            try:
                pkl_response = supabase.storage.from_(bucket_name).download(f"{file_prefix}_index.pkl")
                pkl_file = os.path.join(local_path, "index.pkl")
                with open(pkl_file, "wb") as f:
                    f.write(pkl_response)
                print(f"Downloaded: {file_prefix}_index.pkl")
            except Exception as e:
                print(f"Error downloading index.pkl: {e}")
                return None

            # Rebuild the store with the same embedding model used at index time;
            # a different model would make the stored vectors meaningless
            embeddings = HuggingFaceEmbeddings(
                model_name="LazarusNLP/all-indo-e5-small-v4",
                model_kwargs={"device": "cpu"},
                encode_kwargs={"normalize_embeddings": True}
            )
            # allow_dangerous_deserialization is required because index.pkl is
            # unpickled on load; only enable it for files this app wrote itself
            vector_store = FAISS.load_local(
                local_path,
                embeddings,
                allow_dangerous_deserialization=True
            )
            print(f"Vector store loaded from Supabase bucket: {bucket_name}")
            return vector_store
    except Exception as e:
        print(f"Error loading vector store from Supabase: {e}")
        st.error(f"Error loading from Supabase: {e}")
        return None
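
# Optional caching sketch (an assumption, not part of the original app): since
# this module already runs under Streamlit, wrapping the loader in
# st.cache_resource avoids re-downloading the index from Supabase on every
# rerun. The leading underscore on _supabase tells Streamlit to skip hashing
# the (unhashable) client object when building the cache key.
@st.cache_resource
def get_cached_vector_store(_supabase, bucket_name, file_prefix="vector_store"):
    return load_vector_store_from_supabase(_supabase, bucket_name, file_prefix)
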
def process_documents(docs):
    """Split documents into overlapping chunks and index them in a FAISS vector store."""
    embeddings = HuggingFaceEmbeddings(
        model_name="LazarusNLP/all-indo-e5-small-v4",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True}
    )
    # 1500-character chunks with 300-character overlap, so context is not lost
    # at chunk boundaries
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=300
    )
    text_chunks = text_splitter.split_documents(docs)
    vector_store = FAISS.from_documents(text_chunks, embeddings)
    return vector_store
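
# Minimal usage sketch, assuming a supabase-py v2 client. The bucket name
# "vector-stores", the SUPABASE_URL / SUPABASE_KEY env var names, and the
# sample document are hypothetical placeholders, not part of the original app.
if __name__ == "__main__":
    from supabase import create_client
    from langchain_core.documents import Document

    supabase = create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])

    docs = [Document(page_content="Contoh dokumen untuk diindeks.", metadata={"source": "demo"})]
    store = process_documents(docs)

    # Round-trip: upload the index, then restore it and run a test query
    if save_vector_store_to_supabase(store, supabase, "vector-stores"):
        restored = load_vector_store_from_supabase(supabase, "vector-stores")
        if restored is not None:
            print(restored.similarity_search("contoh", k=1))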