Create create_monte_vectorstore.py
create_monte_vectorstore.py ADDED (+60 -0)
@@ -0,0 +1,60 @@
import os
from bs4 import BeautifulSoup
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
import google.generativeai as genai

# Embed with Gemini
class GeminiEmbeddings(Embeddings):
    def __init__(self, model_name="models/embedding-001", api_key=None):
        # Prefer an explicitly passed key, then the environment; replace the placeholder if needed
        api_key = api_key or os.getenv("GOOGLE_API_KEY", "your-api-key-here")
        os.environ["GOOGLE_API_KEY"] = api_key
        genai.configure(api_key=api_key)
        self.model_name = model_name

    def embed_documents(self, texts):
        return [genai.embed_content(model=self.model_name, content=text, task_type="retrieval_document")["embedding"]
                for text in texts]

    def embed_query(self, text):
        return genai.embed_content(model=self.model_name, content=text, task_type="retrieval_query")["embedding"]

# Your uploaded HTML directory (change this if needed)
HTML_DIR = "monte_docs"  # folder where HTML files like refguide.html are stored
OUTPUT_DIR = "monte_vectorstore"

# Step 1: Parse all HTML files
def parse_html_files(folder):
    chunks = []
    for file in os.listdir(folder):
        if file.endswith(".html") or file.endswith(".htm"):
            with open(os.path.join(folder, file), "r", encoding="utf-8") as f:
                soup = BeautifulSoup(f, "lxml")
                text = soup.get_text(separator="\n").strip()
                chunks.append({"text": text, "document": file})
    return chunks

# Step 2: Split into text chunks
def split_into_chunks(raw_chunks):
    splitter = CharacterTextSplitter(separator="\n", chunk_size=500, chunk_overlap=100)
    split_data = []
    for chunk in raw_chunks:
        for part in splitter.split_text(chunk["text"]):
            split_data.append({"text": part, "document": chunk["document"]})
    return split_data

# Step 3: Embed and save FAISS vectorstore
def save_faiss(split_chunks, output_dir):
    texts = [item["text"] for item in split_chunks]
    metadatas = [{"document": item["document"]} for item in split_chunks]
    embeddings = GeminiEmbeddings()
    vectorstore = FAISS.from_texts(texts, embedding=embeddings, metadatas=metadatas)
    vectorstore.save_local(output_dir)
    print(f"Saved vectorstore to {output_dir}/index.faiss and index.pkl")

# Main
if __name__ == "__main__":
    raw = parse_html_files(HTML_DIR)
    chunks = split_into_chunks(raw)
    save_faiss(chunks, OUTPUT_DIR)
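For reference, a minimal sketch of loading the saved index back for retrieval. This companion script is not part of the commit; it assumes the file above is importable as create_monte_vectorstore, that faiss-cpu, langchain-community, google-generativeai, beautifulsoup4, and lxml are installed, and that your langchain_community release accepts the allow_dangerous_deserialization flag (older releases omit it):

# query_monte_vectorstore.py (hypothetical companion script, not part of this commit)
from langchain_community.vectorstores import FAISS

from create_monte_vectorstore import GeminiEmbeddings, OUTPUT_DIR

# Reload the store written by save_faiss(); the flag acknowledges that
# index.pkl is deserialized with pickle on load.
store = FAISS.load_local(OUTPUT_DIR, GeminiEmbeddings(),
                         allow_dangerous_deserialization=True)

# similarity_search() embeds the query via embed_query() (which uses
# task_type="retrieval_query") and returns the k closest chunks along
# with the source-document metadata attached at index time.
for doc in store.similarity_search("How do I create a trajectory?", k=3):
    print(doc.metadata["document"], "->", doc.page_content[:100])

The same GeminiEmbeddings class must be supplied when loading, since FAISS stores only the raw vectors and needs a compatible embedder to encode new queries.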