"""Build a local Qdrant vector store from a CSV of math problems.

The script reads the dataset, embeds the ``problem`` column with a
SentenceTransformer model, (re)creates the ``math_problems`` collection in an
on-disk Qdrant instance under ``qdrant_data/`` and uploads one point per CSV
row, using the full row as the point payload and a random UUID as its id.
"""

import os
import uuid

import pandas as pd
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer

QDRANT_PATH = "qdrant_data"
DATASET_PATH = "math_dataset (2).csv"
TEXT_COLUMN = "problem"
EMBEDDING_MODEL_NAME = "intfloat/e5-large"

# === Step 1: Ensure Qdrant directory exists ===
# exist_ok=True avoids the check-then-create race of exists() + makedirs().
os.makedirs(QDRANT_PATH, exist_ok=True)

# === Step 2: Load dataset ===
# Ensure this CSV is present and formatted correctly.
data = pd.read_csv(DATASET_PATH)

# Validate before loading the (large) embedding model so that a malformed
# dataset fails fast with a clear message instead of an obscure encoder or
# IndexError further down.
if TEXT_COLUMN not in data.columns:
    raise ValueError(
        f"{DATASET_PATH!r} has no {TEXT_COLUMN!r} column; "
        f"found columns: {list(data.columns)}"
    )
if data.empty:
    raise ValueError(f"{DATASET_PATH!r} contains no rows to index")
if data[TEXT_COLUMN].isna().any():
    missing_rows = int(data[TEXT_COLUMN].isna().sum())
    raise ValueError(
        f"{DATASET_PATH!r} has {missing_rows} row(s) with an empty "
        f"{TEXT_COLUMN!r} value; clean the dataset before indexing"
    )

# === Step 3: Encode questions ===
# NOTE(review): the E5 model card recommends prefixing inputs with "query: " /
# "passage: ". The texts are encoded unprefixed here, as before; confirm that
# the query side uses the same convention before changing this.
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
vectors = embedding_model.encode(
    data[TEXT_COLUMN].tolist(),
    show_progress_bar=True,
)

# === Step 4: Initialize local Qdrant client ===
client = QdrantClient(path=QDRANT_PATH)

# === Step 5: Create collection (recreate ensures it's fresh) ===
# NOTE(review): recent qdrant-client releases appear to deprecate
# recreate_collection in favour of collection_exists / delete_collection /
# create_collection; kept as-is for compatibility with the installed version.
collection_name = "math_problems"
client.recreate_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        # The vector size is taken from the encoder output, not hard-coded.
        size=vectors.shape[1],
        distance=models.Distance.COSINE,
    ),
)

# === Step 6: Prepare payload and upload with UUIDs ===
# Each CSV row becomes the payload of the point at the same position.
payload = data.to_dict(orient="records")
ids = [str(uuid.uuid4()) for _ in range(len(vectors))]

client.upload_collection(
    collection_name=collection_name,
    vectors=vectors,
    payload=payload,
    ids=ids,
)

print("✅ Qdrant vector store created and populated successfully in `qdrant_data/`.")