import os

import google.generativeai as genai
from bs4 import BeautifulSoup
from langchain.embeddings.base import Embeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS


# Embed with Gemini
class GeminiEmbeddings(Embeddings):
    def __init__(self, model_name="models/embedding-001", api_key=None):
        # Prefer an explicitly passed key; otherwise fall back to the environment.
        api_key = api_key or os.getenv("GOOGLE_API_KEY")
        if not api_key:
            raise ValueError("Set GOOGLE_API_KEY or pass api_key= explicitly.")
        genai.configure(api_key=api_key)
        self.model_name = model_name

    def embed_documents(self, texts):
        # task_type="retrieval_document" marks these as corpus passages.
        return [
            genai.embed_content(model=self.model_name, content=text,
                                task_type="retrieval_document")["embedding"]
            for text in texts
        ]

    def embed_query(self, text):
        # task_type="retrieval_query" marks this as the search-query side.
        return genai.embed_content(model=self.model_name, content=text,
                                   task_type="retrieval_query")["embedding"]
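
# Quick sanity check (a sketch, not part of the pipeline): with a valid key,
#   GeminiEmbeddings().embed_query("hello")
# should return a plain list of floats (models/embedding-001 produces
# 768-dimensional vectors), which is what FAISS.from_texts expects below.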

# Your uploaded HTML directory (change this if needed)
HTML_DIR = "monte_docs"  # folder where HTML files like refguide.html are stored
OUTPUT_DIR = "monte_vectorstore"


# Step 1: Parse all HTML files
def parse_html_files(folder):
    chunks = []
    for file in os.listdir(folder):
        if file.endswith((".html", ".htm")):
            with open(os.path.join(folder, file), "r", encoding="utf-8") as f:
                soup = BeautifulSoup(f, "lxml")
                text = soup.get_text(separator="\n").strip()
                chunks.append({"text": text, "document": file})
    return chunks
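
# Each entry is {"text": <full page text>, "document": <source filename>}, so
# the originating file travels with every chunk into the FAISS metadata below.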


# Step 2: Split into text chunks
def split_into_chunks(raw_chunks):
    splitter = CharacterTextSplitter(separator="\n", chunk_size=500, chunk_overlap=100)
    split_data = []
    for chunk in raw_chunks:
        for part in splitter.split_text(chunk["text"]):
            split_data.append({"text": part, "document": chunk["document"]})
    return split_data
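
# Note: CharacterTextSplitter measures chunk_size in characters, not tokens.
# It splits on the separator first; a newline-delimited block longer than 500
# characters is kept whole, and LangChain logs a warning when that happens.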


# Step 3: Embed and save FAISS vectorstore
def save_faiss(split_chunks, output_dir):
    texts = [item["text"] for item in split_chunks]
    metadatas = [{"document": item["document"]} for item in split_chunks]
    embeddings = GeminiEmbeddings()
    vectorstore = FAISS.from_texts(texts, embedding=embeddings, metadatas=metadatas)
    vectorstore.save_local(output_dir)
    print(f"Saved vectorstore to {output_dir}/ (index.faiss and index.pkl)")


# Main
if __name__ == "__main__":
    raw = parse_html_files(HTML_DIR)
    chunks = split_into_chunks(raw)
    save_faiss(chunks, OUTPUT_DIR)
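
    # Sanity check (a sketch; the query string is just an illustrative
    # placeholder). Recent langchain_community releases require
    # allow_dangerous_deserialization=True to load the pickled metadata,
    # which is safe here because we created the files ourselves above.
    store = FAISS.load_local(OUTPUT_DIR, GeminiEmbeddings(),
                             allow_dangerous_deserialization=True)
    for doc in store.similarity_search("How do I get started?", k=3):
        print(doc.metadata["document"], "->", doc.page_content[:80])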