# ScientificChatbot / create_monte_vectorstore.py
# (uploaded by ZarinT, commit fc5e45c verified)
import os
from bs4 import BeautifulSoup
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
import google.generativeai as genai
# πŸ”‘ Embed with Gemini
class GeminiEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by the Gemini embedding API.

    Uses ``task_type="retrieval_document"`` for corpus texts and
    ``task_type="retrieval_query"`` for queries, as Gemini recommends
    for retrieval workloads.
    """

    def __init__(self, model_name="models/embedding-001", api_key=None):
        """Configure the Gemini client.

        Args:
            model_name: Gemini embedding model identifier.
            api_key: explicit API key; if ``None``, falls back to the
                ``GOOGLE_API_KEY`` environment variable.
        """
        # Bug fix: previously an explicitly passed api_key was ignored and
        # unconditionally replaced by the env var / placeholder. Now the
        # argument wins; the environment is only a fallback.
        if api_key is None:
            api_key = os.getenv("GOOGLE_API_KEY", "your-api-key-here")  # Replace if needed
        # Keep exporting the key so downstream code reading the env still works.
        os.environ["GOOGLE_API_KEY"] = api_key
        genai.configure(api_key=api_key)
        self.model_name = model_name

    def embed_documents(self, texts):
        """Embed a list of document chunks for retrieval indexing."""
        return [
            genai.embed_content(
                model=self.model_name,
                content=text,
                task_type="retrieval_document",
            )["embedding"]
            for text in texts
        ]

    def embed_query(self, text):
        """Embed a single query string for retrieval lookup."""
        return genai.embed_content(
            model=self.model_name,
            content=text,
            task_type="retrieval_query",
        )["embedding"]
# πŸ“ Your uploaded HTML directory (change this if needed)
HTML_DIR = "monte_docs" # folder where HTML files like refguide.html are stored
OUTPUT_DIR = "monte_vectorstore"
# 🧼 Step 1: Parse all HTML files
def parse_html_files(folder):
    """Read every HTML file in *folder* and extract its visible text.

    Args:
        folder: directory containing ``.html`` / ``.htm`` files.

    Returns:
        A list of ``{"text": <page text>, "document": <filename>}`` dicts,
        in sorted filename order.
    """
    chunks = []
    # sorted() makes output order deterministic across filesystems;
    # lower() also matches uppercase extensions like ".HTML" / ".HTM".
    for file in sorted(os.listdir(folder)):
        if file.lower().endswith((".html", ".htm")):
            with open(os.path.join(folder, file), "r", encoding="utf-8") as f:
                soup = BeautifulSoup(f, "lxml")
                # Newline separator keeps block elements on distinct lines.
                text = soup.get_text(separator="\n").strip()
                chunks.append({"text": text, "document": file})
    return chunks
# βœ‚οΈ Step 2: Split into text chunks
def split_into_chunks(raw_chunks):
    """Break each parsed document into overlapping ~500-char chunks.

    Args:
        raw_chunks: dicts with "text" and "document" keys (from parsing).

    Returns:
        A flat list of smaller dicts with the same two keys.
    """
    splitter = CharacterTextSplitter(separator="\n", chunk_size=500, chunk_overlap=100)
    # One output record per split piece, carrying the source filename along.
    return [
        {"text": piece, "document": source["document"]}
        for source in raw_chunks
        for piece in splitter.split_text(source["text"])
    ]
# πŸ” Step 3: Embed and save FAISS vectorstore
def save_faiss(split_chunks, output_dir):
    """Embed every chunk with Gemini and persist a FAISS index locally.

    Args:
        split_chunks: dicts with "text" and "document" keys.
        output_dir: directory receiving index.faiss and index.pkl.
    """
    docs = [entry["text"] for entry in split_chunks]
    metas = [{"document": entry["document"]} for entry in split_chunks]
    store = FAISS.from_texts(docs, embedding=GeminiEmbeddings(), metadatas=metas)
    store.save_local(output_dir)
    print(f"✅ Saved vectorstore to {output_dir}/index.faiss and index.pkl")
# πŸš€ Main
if __name__ == "__main__":
    # Pipeline: parse HTML -> split into chunks -> embed & persist index.
    parsed = parse_html_files(HTML_DIR)
    pieces = split_into_chunks(parsed)
    save_faiss(pieces, OUTPUT_DIR)