import os
from bs4 import BeautifulSoup
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
import google.generativeai as genai
# Embed with Gemini
class GeminiEmbeddings(Embeddings):
    def __init__(self, model_name="models/embedding-001", api_key=None):
        # Prefer an explicitly passed key; otherwise fall back to the environment.
        api_key = api_key or os.getenv("GOOGLE_API_KEY", "your-api-key-here")  # Replace if needed
        os.environ["GOOGLE_API_KEY"] = api_key
        genai.configure(api_key=api_key)
        self.model_name = model_name
    def embed_documents(self, texts):
        return [
            genai.embed_content(model=self.model_name, content=text, task_type="retrieval_document")["embedding"]
            for text in texts
        ]

    def embed_query(self, text):
        return genai.embed_content(model=self.model_name, content=text, task_type="retrieval_query")["embedding"]
# Your uploaded HTML directory (change this if needed)
HTML_DIR = "monte_docs"  # folder where HTML files like refguide.html are stored
OUTPUT_DIR = "monte_vectorstore"
# Step 1: Parse all HTML files
def parse_html_files(folder):
    chunks = []
    for file in os.listdir(folder):
        if file.endswith(".html") or file.endswith(".htm"):
            with open(os.path.join(folder, file), "r", encoding="utf-8") as f:
                soup = BeautifulSoup(f, "lxml")
                text = soup.get_text(separator="\n").strip()
                chunks.append({"text": text, "document": file})
    return chunks
# Step 2: Split into text chunks
def split_into_chunks(raw_chunks):
    splitter = CharacterTextSplitter(separator="\n", chunk_size=500, chunk_overlap=100)
    split_data = []
    for chunk in raw_chunks:
        for part in splitter.split_text(chunk["text"]):
            split_data.append({"text": part, "document": chunk["document"]})
    return split_data
# Step 3: Embed and save FAISS vectorstore
def save_faiss(split_chunks, output_dir):
    texts = [item["text"] for item in split_chunks]
    metadatas = [{"document": item["document"]} for item in split_chunks]
    embeddings = GeminiEmbeddings()
    vectorstore = FAISS.from_texts(texts, embedding=embeddings, metadatas=metadatas)
    vectorstore.save_local(output_dir)
    print(f"Saved vectorstore to {output_dir}/index.faiss and index.pkl")
# Main
if __name__ == "__main__":
    raw = parse_html_files(HTML_DIR)
    chunks = split_into_chunks(raw)
    save_faiss(chunks, OUTPUT_DIR)
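
For retrieval, here is a minimal sketch of loading the saved index back and querying it. It assumes the same GeminiEmbeddings class and output directory as above, and that your langchain_community version supports the allow_dangerous_deserialization flag (required on recent releases when loading a locally pickled index); the query string is illustrative only.

# Load the vectorstore built above and run a similarity search against it
embeddings = GeminiEmbeddings()
store = FAISS.load_local(
    "monte_vectorstore",
    embeddings,
    allow_dangerous_deserialization=True,  # needed on recent LangChain releases
)
for doc in store.similarity_search("example query about the reference guide", k=3):
    # Each hit carries the source filename in its metadata, set in Step 3
    print(doc.metadata["document"], "->", doc.page_content[:80])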