Create create_monte_vectorstore.py
create_monte_vectorstore.py ADDED (+60 -0)
@@ -0,0 +1,60 @@
import os
from bs4 import BeautifulSoup
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
import google.generativeai as genai

# Embed with Gemini
class GeminiEmbeddings(Embeddings):
    def __init__(self, model_name="models/embedding-001", api_key=None):
        # Prefer an explicitly passed key, then the environment; replace the placeholder if needed
        api_key = api_key or os.getenv("GOOGLE_API_KEY", "your-api-key-here")
        os.environ["GOOGLE_API_KEY"] = api_key
        genai.configure(api_key=api_key)
        self.model_name = model_name

    def embed_documents(self, texts):
        return [genai.embed_content(model=self.model_name, content=text, task_type="retrieval_document")["embedding"]
                for text in texts]

    def embed_query(self, text):
        return genai.embed_content(model=self.model_name, content=text, task_type="retrieval_query")["embedding"]

# Your uploaded HTML directory (change this if needed)
HTML_DIR = "monte_docs"  # folder where HTML files like refguide.html are stored
OUTPUT_DIR = "monte_vectorstore"

# Step 1: Parse all HTML files
def parse_html_files(folder):
    chunks = []
    for file in os.listdir(folder):
        if file.endswith(".html") or file.endswith(".htm"):
            with open(os.path.join(folder, file), "r", encoding="utf-8") as f:
                soup = BeautifulSoup(f, "lxml")
                text = soup.get_text(separator="\n").strip()
                chunks.append({"text": text, "document": file})
    return chunks

# Step 2: Split into text chunks
def split_into_chunks(raw_chunks):
    splitter = CharacterTextSplitter(separator="\n", chunk_size=500, chunk_overlap=100)
    split_data = []
    for chunk in raw_chunks:
        for part in splitter.split_text(chunk["text"]):
            split_data.append({"text": part, "document": chunk["document"]})
    return split_data

# Step 3: Embed and save FAISS vectorstore
def save_faiss(split_chunks, output_dir):
    texts = [item["text"] for item in split_chunks]
    metadatas = [{"document": item["document"]} for item in split_chunks]
    embeddings = GeminiEmbeddings()
    vectorstore = FAISS.from_texts(texts, embedding=embeddings, metadatas=metadatas)
    vectorstore.save_local(output_dir)
    print(f"Saved vectorstore to {output_dir}/index.faiss and index.pkl")

# Main
if __name__ == "__main__":
    raw = parse_html_files(HTML_DIR)
    chunks = split_into_chunks(raw)
    save_faiss(chunks, OUTPUT_DIR)
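For reference, a minimal sketch of loading the saved index back for retrieval. This companion script is not part of the commit; it assumes the file above is importable as create_monte_vectorstore, that faiss-cpu, langchain-community, google-generativeai, beautifulsoup4, and lxml are installed, and that your langchain_community release accepts the allow_dangerous_deserialization flag (older releases omit it):

# query_monte_vectorstore.py (hypothetical companion script, not part of this commit)
from langchain_community.vectorstores import FAISS

from create_monte_vectorstore import GeminiEmbeddings, OUTPUT_DIR

# Reload the store written by save_faiss(); the flag acknowledges that
# index.pkl is deserialized with pickle on load.
store = FAISS.load_local(OUTPUT_DIR, GeminiEmbeddings(),
                         allow_dangerous_deserialization=True)

# similarity_search() embeds the query via embed_query() (which uses
# task_type="retrieval_query") and returns the k closest chunks along
# with the source-document metadata attached at index time.
for doc in store.similarity_search("How do I create a trajectory?", k=3):
    print(doc.metadata["document"], "->", doc.page_content[:100])

The same GeminiEmbeddings class must be supplied when loading, since FAISS stores only the raw vectors and needs a compatible embedder to encode new queries.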