File size: 5,039 Bytes
2af0eb7
 
 
 
4779f10
2af0eb7
 
 
 
 
 
4779f10
 
2af0eb7
4779f10
2af0eb7
 
 
 
 
 
 
 
 
 
4779f10
 
 
 
 
 
 
 
 
2af0eb7
 
 
 
 
 
 
4779f10
 
 
 
 
 
 
 
 
 
 
2af0eb7
4779f10
2af0eb7
 
 
 
 
 
 
4779f10
2af0eb7
 
 
 
 
 
 
 
4779f10
2af0eb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4779f10
2af0eb7
 
 
 
 
 
 
 
4779f10
2af0eb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of document chunks:  0\n",
      "Number of githb chunks:  0\n",
      "\n",
      "Sample search result(n=2): \n"
     ]
    }
   ],
   "source": [
    "from shared import getQdrantClient, getEmbeddingsModel\n",
    "qClient = getQdrantClient()\n",
    "\n",
    "# Show everything in the Document collection\n",
    "numDocumentChunks = 0\n",
    "# Note with_vectors defaults to false, so the vectors are not returned\n",
    "chunks = qClient.scroll(collection_name='Document', limit=100)\n",
    "while True:\n",
    "    for chunk in chunks[0]:\n",
    "        if numDocumentChunks == 0:\n",
    "            sampleDocumentChunk = chunk\n",
    "        numDocumentChunks += 1\n",
    "    chunks = qClient.scroll(collection_name='Document', limit=100, with_payload=False, offset=chunks[1])\n",
    "    if chunks[1] is None:\n",
    "        break\n",
    "print(\"Number of document chunks: \", numDocumentChunks)\n",
    "if numDocumentChunks > 0:\n",
    "    print(\"\\nSample document chunk(metadata not the vector): \")\n",
    "    print(sampleDocumentChunk, '\\n')\n",
    "\n",
    "# Show everything in the Github collection\n",
    "numGithubChunks = 0\n",
    "# Note with_vectors defaults to false, so the vectors are not returned\n",
    "chunks = qClient.scroll(collection_name='Github', limit=100)\n",
    "while True:\n",
    "    for chunk in chunks[0]:\n",
    "        if numGithubChunks == 0:\n",
    "            sampleGithubChunk = chunk\n",
    "        numGithubChunks += 1\n",
    "    chunks = qClient.scroll(collection_name='Github', limit=100, with_payload=False, offset=chunks[1])\n",
    "    if chunks[1] is None:\n",
    "        break\n",
    "print(\"Number of githb chunks: \", numDocumentChunks)\n",
    "if numGithubChunks > 0:\n",
    "    print(\"\\nSample github chunk(metadata not the vector): \")\n",
    "    print(sampleGithubChunk, '\\n')\n",
    "\n",
    "# Show a sample search\n",
    "embeddingsModel = getEmbeddingsModel()\n",
    "results = qClient.search(\n",
    "    collection_name=\"Document\",\n",
    "    query_vector = embeddingsModel.embed_query(\"What operating system is ROS made for?\"),\n",
    "    limit=10\n",
    ")\n",
    "print(\"\\nSample search result(n=2): \")\n",
    "for result in results:\n",
    "    print(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cosine Similarity for related sentences: 0.7035977848391597\n",
      "Cosine Similarity for unrelated sentences: 0.3566534327076298\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "# How cosine distance works\n",
    "\n",
    "embedding1 = embeddingsModel.embed_query(\"What is the weather like?\")\n",
    "embedding2 = embeddingsModel.embed_query(\"It is raining today.\")\n",
    "embedding3 = embeddingsModel.embed_query(\"ROS is an open source platform\")\n",
    "def cosine_similarity(vec1, vec2):\n",
    "    dot_product = np.dot(vec1, vec2)\n",
    "    norm_vec1 = np.linalg.norm(vec1)\n",
    "    norm_vec2 = np.linalg.norm(vec2)\n",
    "    return dot_product / (norm_vec1 * norm_vec2)\n",
    "similarity1 = cosine_similarity(embedding1, embedding2)\n",
    "similarity2 = cosine_similarity(embedding1, embedding3)\n",
    "print(\"Cosine Similarity for related sentences:\", similarity1)\n",
    "print(\"Cosine Similarity for unrelated sentences:\", similarity2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from qdrant_client.http.models import Distance, VectorParams\n",
    "# Delete all collections and vectors inside them\n",
    "qClient.delete_collection(collection_name = \"Document\")\n",
    "qClient.delete_collection(collection_name = \"Github\")\n",
    "# Recreate the empty collections\n",
    "qClient.create_collection(\n",
    "    collection_name = \"Document\",\n",
    "    vectors_config=VectorParams(size=3072, distance=Distance.COSINE)\n",
    ")\n",
    "qClient.create_collection(\n",
    "    collection_name = \"Github\",\n",
    "    vectors_config=VectorParams(size=3072, distance=Distance.COSINE)\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}