{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.12/site-packages/pymongo/synchronous/collection.py:1920: UserWarning: use an explicit session with no_cursor_timeout=True otherwise the cursor may still timeout after 30 minutes, for more info see https://mongodb.com/docs/v4.4/reference/method/cursor.noCursorTimeout/#session-idle-timeout-overrides-nocursortimeout\n",
      "  return Cursor(self, *args, **kwargs)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Stopping document loop\n",
      "Stopping document loop\n"
     ]
    }
   ],
   "source": [
    "# See README for more info on how the FeaturePipeline works\n",
    "# The Ingestion pipeline is part of the FeaturePipeline\n",
    "# Make sure to ollama serve before running!\n",
    "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
    "from qdrant_client.http.models import Distance, VectorParams, PointStruct\n",
    "from shared import getMongoClient, getQdrantClient, getEmbeddingsModel\n",
    "\n",
    "# Create a mongoDB connection\n",
    "mongoHost = getMongoClient()\n",
    "\n",
    "# Create a qdrant connection\n",
    "qClient = getQdrantClient()\n",
    "\n",
    "# Create qdrant collections to store embeddings\n",
    "if not qClient.collection_exists(\"Github\"):\n",
    "    qClient.create_collection(\n",
    "        collection_name=\"Github\",\n",
    "        vectors_config=VectorParams(size=3072, distance=Distance.COSINE),\n",
    "    )\n",
    "if not qClient.collection_exists(\"Document\"):\n",
    "    qClient.create_collection(\n",
    "        collection_name=\"Document\",\n",
    "        vectors_config=VectorParams(size=3072, distance=Distance.COSINE),\n",
    "    )\n",
    "\n",
    "# Ingestion Pipeline Setup\n",
    "# Define a text cleaner\n",
    "def cleanText(text):\n",
    "    return ''.join(char for char in text if 32 <= ord(char) <= 126)\n",
    "\n",
    "# Setup the text chunker\n",
    "text_splitter = RecursiveCharacterTextSplitter(\n",
    "    chunk_size=500,\n",
    "    chunk_overlap=20,\n",
    "    length_function=len,\n",
    "    is_separator_regex=False,\n",
    ")\n",
    "\n",
    "# Setup the text embedder\n",
    "embeddingsModel = getEmbeddingsModel()\n",
    "\n",
    "# Running the ingestion pipeline\n",
    "# Store all documents from each MongoDB collection into qdrant\n",
    "mongoDatabase =  mongoHost[\"twin\"]\n",
    "collections = mongoDatabase.list_collection_names()\n",
    "for collection in collections:\n",
    "    mongoCollection = mongoDatabase[collection]\n",
    "\n",
    "    documents = mongoCollection.find(no_cursor_timeout=True)\n",
    "    id = 0\n",
    "    try:\n",
    "        for document in documents:\n",
    "            # For each document, split it into chunks\n",
    "            link = document[\"link\"]\n",
    "            resultType = document[\"type\"]\n",
    "            text = document[\"content\"]\n",
    "            text = cleanText(text)\n",
    "            chunks = text_splitter.split_text(text)\n",
    "            chunkNum = 0\n",
    "            embeddings = embeddingsModel.embed_documents(chunks)\n",
    "            for chunk in chunks:\n",
    "                # Create embeddings for each chunk, of length 3072 using the embedding model\n",
    "                # Store the embedding along with some metadata into the Qdrant vector database\n",
    "                qClient.upsert(collection_name=resultType, wait=True, points=[PointStruct(id=id, vector=embeddings[chunkNum], payload={\"link\": link, \"type\": resultType, \"chunk\": chunkNum, \"text\": chunk})])\n",
    "                chunkNum += 1\n",
    "                id += 1\n",
    "    except:\n",
    "        print(\"Stopping document loop\")\n",
    "    \n"
   ]
  }
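  ,
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A sanity-check sketch, not part of the original pipeline: the next cell re-uses the shared Qdrant client and embeddings model to count the stored points and run a small similarity search against the `Document` collection. The query string is an arbitrary example; `count` and `search` are standard `qdrant_client` calls."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity-check sketch (assumption: run after the ingestion cell above)\n",
    "from shared import getQdrantClient, getEmbeddingsModel\n",
    "\n",
    "qClient = getQdrantClient()\n",
    "embeddingsModel = getEmbeddingsModel()\n",
    "\n",
    "# Count the points stored in each Qdrant collection\n",
    "for collectionName in [\"Github\", \"Document\"]:\n",
    "    count = qClient.count(collection_name=collectionName, exact=True).count\n",
    "    print(f\"{collectionName}: {count} points\")\n",
    "\n",
    "# Embed an example query with the same model so the vector matches the\n",
    "# 3072-dimension collections, then fetch the three nearest chunks\n",
    "queryVector = embeddingsModel.embed_query(\"How does the feature pipeline work?\")\n",
    "hits = qClient.search(collection_name=\"Document\", query_vector=queryVector, limit=3)\n",
    "for hit in hits:\n",
    "    print(hit.score, hit.payload[\"link\"], hit.payload[\"text\"][:80])\n"
   ]
  }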
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}