ZarinT committed on
Commit
fc5e45c
·
verified ·
1 Parent(s): ff996e6

Create create_monte_vectorstore.py

Browse files
Files changed (1) hide show
  1. create_monte_vectorstore.py +60 -0
create_monte_vectorstore.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from bs4 import BeautifulSoup
3
+ from langchain.text_splitter import CharacterTextSplitter
4
+ from langchain_community.vectorstores import FAISS
5
+ from langchain.embeddings.base import Embeddings
6
+ import google.generativeai as genai
7
+
8
# Embedding adapter: exposes Google's Gemini embedding API through the
# LangChain `Embeddings` interface so it can back a FAISS vectorstore.
class GeminiEmbeddings(Embeddings):
    """LangChain-compatible embeddings backed by a Gemini embedding model."""

    def __init__(self, model_name="models/embedding-001", api_key=None):
        """Configure the Gemini client.

        Args:
            model_name: Gemini embedding model identifier.
            api_key: Explicit API key. When omitted, falls back to the
                GOOGLE_API_KEY environment variable.

        Raises:
            ValueError: if no API key is given and GOOGLE_API_KEY is unset.
        """
        # Bug fix: the original ignored the `api_key` argument and always
        # overwrote it from the environment (with a "your-api-key-here"
        # placeholder fallback that only failed later, inside the API call).
        api_key = api_key or os.getenv("GOOGLE_API_KEY")
        if not api_key:
            raise ValueError(
                "No Gemini API key: pass api_key= or set GOOGLE_API_KEY."
            )
        # Keep the env var in sync for any downstream code that reads it.
        os.environ["GOOGLE_API_KEY"] = api_key
        genai.configure(api_key=api_key)
        self.model_name = model_name

    def embed_documents(self, texts):
        """Embed each document chunk for retrieval indexing.

        Returns a list of embedding vectors, one per input text.
        """
        return [
            genai.embed_content(
                model=self.model_name,
                content=text,
                task_type="retrieval_document",
            )["embedding"]
            for text in texts
        ]

    def embed_query(self, text):
        """Embed a single query string for retrieval lookup."""
        return genai.embed_content(
            model=self.model_name,
            content=text,
            task_type="retrieval_query",
        )["embedding"]
22
+
23
+ # πŸ“ Your uploaded HTML directory (change this if needed)
24
+ HTML_DIR = "monte_docs" # folder where HTML files like refguide.html are stored
25
+ OUTPUT_DIR = "monte_vectorstore"
26
+
27
# Step 1: extract plain text from every HTML file in a directory.
def parse_html_files(folder):
    """Return a list of {"text", "document"} dicts, one per HTML file.

    Each entry holds the file's visible text (tags stripped, whitespace
    trimmed) and the originating filename.
    """
    extracted = []
    for name in os.listdir(folder):
        # Skip anything that is not an HTML document.
        if not name.endswith((".html", ".htm")):
            continue
        path = os.path.join(folder, name)
        with open(path, "r", encoding="utf-8") as handle:
            markup = BeautifulSoup(handle, "lxml")
        extracted.append(
            {"text": markup.get_text(separator="\n").strip(), "document": name}
        )
    return extracted
37
+
38
# Step 2: break each document's text into overlapping chunks.
def split_into_chunks(raw_chunks):
    """Split each {"text", "document"} entry into 500-char chunks.

    Returns a flat list of {"text", "document"} dicts; every chunk keeps
    the filename of the document it came from.
    """
    splitter = CharacterTextSplitter(separator="\n", chunk_size=500, chunk_overlap=100)
    return [
        {"text": piece, "document": entry["document"]}
        for entry in raw_chunks
        for piece in splitter.split_text(entry["text"])
    ]
46
+
47
+ # πŸ” Step 3: Embed and save FAISS vectorstore
48
+ def save_faiss(split_chunks, output_dir):
49
+ texts = [item["text"] for item in split_chunks]
50
+ metadatas = [{"document": item["document"]} for item in split_chunks]
51
+ embeddings = GeminiEmbeddings()
52
+ vectorstore = FAISS.from_texts(texts, embedding=embeddings, metadatas=metadatas)
53
+ vectorstore.save_local(output_dir)
54
+ print(f"βœ… Saved vectorstore to {output_dir}/index.faiss and index.pkl")
55
+
56
# Entry point: parse the HTML corpus, chunk it, and build the vectorstore.
def main():
    """Run the full pipeline: parse -> split -> embed & save."""
    parsed = parse_html_files(HTML_DIR)
    pieces = split_into_chunks(parsed)
    save_faiss(pieces, OUTPUT_DIR)


if __name__ == "__main__":
    main()