import os
from bs4 import BeautifulSoup
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
import google.generativeai as genai

# 🔑 Embed with Gemini
class GeminiEmbeddings(Embeddings):
    def __init__(self, model_name="models/embedding-001", api_key=None):
        # Prefer an explicitly passed key; otherwise fall back to the environment variable.
        api_key = api_key or os.getenv("GOOGLE_API_KEY", "your-api-key-here")  # Replace if needed
        os.environ["GOOGLE_API_KEY"] = api_key
        genai.configure(api_key=api_key)
        self.model_name = model_name

    def embed_documents(self, texts):
        return [
            genai.embed_content(
                model=self.model_name,
                content=text,
                task_type="retrieval_document",
            )["embedding"]
            for text in texts
        ]

    def embed_query(self, text):
        return genai.embed_content(
            model=self.model_name,
            content=text,
            task_type="retrieval_query",
        )["embedding"]


# 📁 Your uploaded HTML directory (change this if needed)
HTML_DIR = "monte_docs"  # folder where HTML files like refguide.html are stored
OUTPUT_DIR = "monte_vectorstore"


# 🧼 Step 1: Parse all HTML files
def parse_html_files(folder):
    chunks = []
    for file in os.listdir(folder):
        if file.endswith(".html") or file.endswith(".htm"):
            with open(os.path.join(folder, file), "r", encoding="utf-8") as f:
                soup = BeautifulSoup(f, "lxml")
                text = soup.get_text(separator="\n").strip()
                chunks.append({"text": text, "document": file})
    return chunks


# ✂️ Step 2: Split into text chunks
def split_into_chunks(raw_chunks):
    splitter = CharacterTextSplitter(separator="\n", chunk_size=500, chunk_overlap=100)
    split_data = []
    for chunk in raw_chunks:
        for part in splitter.split_text(chunk["text"]):
            split_data.append({"text": part, "document": chunk["document"]})
    return split_data


# 🔐 Step 3: Embed and save FAISS vectorstore
def save_faiss(split_chunks, output_dir):
    texts = [item["text"] for item in split_chunks]
    metadatas = [{"document": item["document"]} for item in split_chunks]
    embeddings = GeminiEmbeddings()
    vectorstore = FAISS.from_texts(texts, embedding=embeddings, metadatas=metadatas)
    vectorstore.save_local(output_dir)
    print(f"✅ Saved vectorstore to {output_dir}/index.faiss and index.pkl")


# 🚀 Main
if __name__ == "__main__":
    raw = parse_html_files(HTML_DIR)
    chunks = split_into_chunks(raw)
    save_faiss(chunks, OUTPUT_DIR)
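

# 🔎 Optional: query the saved vectorstore.
# A minimal sketch, not part of the original pipeline: it reuses the
# GeminiEmbeddings class above and langchain_community's FAISS.load_local /
# similarity_search API. The example query string and k value are illustrative.
def query_vectorstore(query, store_dir=OUTPUT_DIR, k=3):
    embeddings = GeminiEmbeddings()
    # Recent langchain_community releases require explicitly opting in to
    # pickle deserialization of index.pkl.
    vectorstore = FAISS.load_local(
        store_dir, embeddings, allow_dangerous_deserialization=True
    )
    return vectorstore.similarity_search(query, k=k)

# Example usage:
#   for doc in query_vectorstore("How do I configure a simulation run?"):
#       print(doc.metadata["document"], "->", doc.page_content[:120])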