In [33]:
from shared import getQdrantClient, getEmbeddingsModel
qClient = getQdrantClient()

# Show everything in the Document collection
numDocumentChunks = 0
# Note with_vectors defaults to false, so the vectors are not returned
chunks = qClient.scroll(collection_name='Document', limit=100)
while True:
    for chunk in chunks[0]:
        if numDocumentChunks == 0:
            sampleDocumentChunk = chunk
        numDocumentChunks += 1
    chunks = qClient.scroll(collection_name='Document', limit=100, with_payload=False, offset=chunks[1])
    if chunks[1] is None:
        break
print("Number of document chunks: ", numDocumentChunks)
if numDocumentChunks > 0:
    print("\nSample document chunk(metadata not the vector): ")
    print(sampleDocumentChunk, '\n')

# Show everything in the Github collection
numGithubChunks = 0
# Note with_vectors defaults to false, so the vectors are not returned(since they are very large)
chunks = qClient.scroll(collection_name='Github', limit=100)
while True:
    for chunk in chunks[0]:
        if numGithubChunks == 0:
            sampleGithubChunk = chunk
        numGithubChunks += 1
    chunks = qClient.scroll(collection_name='Github', limit=100, with_payload=False, offset=chunks[1])
    if chunks[1] is None:
        break
print("Number of githb chunks: ", numGithubChunks)
if numGithubChunks > 0:
    print("\nSample github chunk(with_vector=false): ")
    print(sampleGithubChunk, '\n')

# Show a sample search
embeddingsModel = getEmbeddingsModel()
results = qClient.search(
    collection_name="Document",
    query_vector = embeddingsModel.embed_query("How many companies is Nav2 trusted by worldwide?"),
    limit=10
)
print("\nSample search result(n=10): ")
for result in results:
    print(result)

Number of document chunks:  14800

Sample document chunk(metadata not the vector): 
id=0 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 0, 'text': 'ROS: Home Why ROS? Getting Started Community Ecosystem ROS - Robot Operating System The Robot Operating System (ROS) is a set of software libraries and tools that help you build robot applications. From drivers to state-of-the-art algorithms, and with powerful developer tools, ROS has what you need for your next robotics project. And it\'s all open source. What is ROS? ROS Videos " Install Jazzy Jalisco Jazzy Jalisco is our latest ROS 2 LTS release targeted at the Ubuntu 24.04 (Noble) and'} vector=None shard_key=None order_value=None 

Number of githb chunks:  3600

Sample github chunk(with_vector=false): 
id=0 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 0, 'text': "#About TheRobotOperatingSystem(ROS)isasetofsoftwarelibrariesandtoolsthathelpyoubuildrobotapplicatio

In [34]:
# Check how many chunks total will be processed by the FeaturePipeline
from shared import getMongoClient
from langchain_text_splitters import RecursiveCharacterTextSplitter


texts = []
# Create a mongoDB connection
mongoHost = getMongoClient()
mongoDatabase = mongoHost["twin"]
collections = mongoDatabase.list_collection_names()
for collection in collections:
    mongoCollection = mongoDatabase[collection]
    results = mongoCollection.find()
    for result in results:
        # For each document, split it into chunks
        texts.append(result["content"])

cleanTexts = []
for text in texts:
    cleanTexts.append("".join(char for char in text if 32 <= ord(char) <= 126))

numChunks = 0
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
for text in cleanTexts:
    textChunks = text_splitter.split_text(text)
    for chunk in textChunks:
        numChunks += 1

print("Total number of chunks to embed: ", numChunks)
print("Chunks currently embedded: ", numDocumentChunks+numGithubChunks)

Total number of chunks to embed:  285569
Chunks currently embedded:  18400


In [17]:
import numpy as np
# How cosine distance works

queryEmbedding = embeddingsModel.embed_query("What is the weather like?")
documentEmbedding = embeddingsModel.embed_documents(["It is raining today.", "ROS is an open source platform"])
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)
similarity1 = cosine_similarity(queryEmbedding, documentEmbedding[0])
similarity2 = cosine_similarity(queryEmbedding, documentEmbedding[1])
print("Cosine Similarity for related sentences:", similarity1)
print("Cosine Similarity for unrelated sentences:", similarity2)

Cosine Similarity for related sentences: 0.523006986899456
Cosine Similarity for unrelated sentences: 0.32259653091273344


In [19]:
from qdrant_client.http.models import Distance, VectorParams
# Delete all collections and vectors inside them
qClient.delete_collection(collection_name = "Document")
qClient.delete_collection(collection_name = "Github")
# Recreate the empty collections
qClient.create_collection(
    collection_name = "Document",
    vectors_config=VectorParams(size=3072, distance=Distance.COSINE)
)
qClient.create_collection(
    collection_name = "Github",
    vectors_config=VectorParams(size=3072, distance=Distance.COSINE)
)

True