KenTheNoob commited on
Commit
2af0eb7
·
1 Parent(s): 90d383a

Moving HF spaces to HF model

Browse files
.devcontainer/Dockerfile ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ FROM mcr.microsoft.com/devcontainers/python:1-3.12-bullseye
2
+
3
+ RUN pip install --no-cache-dir python-dotenv==1.0.1 langchain==0.3.10 langchain-community==0.3.10 \
4
+ qdrant-client==1.12.1 gradio==5.8.0 pymongo==4.10.1 requests==2.32.3 bs4==0.0.2 ipykernel==6.29.5 ipython==8.27.0 \
5
+ ipywidgets==8.1.5 jupyter==1.1.1 jupyter-client==8.6.2 jupyter-console==6.6.3 jupyter-core==5.7.2 jupyter-server==2.14.2 \
6
+ jupyter-events==0.10.0 jupyter-lsp==2.2.5 jupyter-server-terminals==0.5.3 jupyterlab==4.2.5 jupyterlab-pygments==0.3.0 \
7
+ jupyterlab-quarto==0.3.5 jupyterlab-server==2.27.3 jupyterlab-widgets==3.0.13 langchain-openai==0.2.11 clearml==1.16.4
.devcontainer/devcontainer.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // For format details, see https://aka.ms/devcontainer.json. For config options, see the
2
+ // README at: https://github.com/devcontainers/templates/tree/main/src/python
3
+ {
4
+ "name": "Python 3",
5
+ // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
6
+ "build": {
7
+ // Path is relative to the devcontainer.json file.
8
+ "dockerfile": "Dockerfile"
9
+ },
10
+
11
+ // Features to add to the dev container. More info: https://containers.dev/features.
12
+ // "features": {},
13
+
14
+ // Use 'forwardPorts' to make a list of ports inside the container available locally.
15
+ // "forwardPorts": [],
16
+
17
+ // Use 'postCreateCommand' to run commands after the container is created.
18
+ // "postCreateCommand": "pip3 install --user -r requirements.txt",
19
+
20
+ // Configure tool-specific properties.
21
+ "customizations": {
22
+ // Install jupyter, mongodb, and docker
23
+ "vscode": {
24
+ "extensions": [
25
+ "ms-toolsai.jupyter",
26
+ "mongodb.mongodb-vscode",
27
+ "ms-azuretools.vscode-docker"
28
+ ],
29
+ "settings": {
30
+
31
+ }
32
+ }
33
+ },
34
+
35
+ // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
36
+ "remoteUser": "root"
37
+ }
project/.env.example ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MongoDB database
2
+ # Change DATABASE_HOST if you run mongodb from the web
3
+ # https://www.mongodb.com/resources/products/fundamentals/mongodb-cluster-setup
4
+ DATABASE_HOST="mongodb://host.docker.internal:27017"
5
+
6
+ # Qdrant vector database
7
+ # USE_QDRANT_CLOUD="False" if you setup the qdrant docker container(docker compose)
8
+ # USE_QDRANT_CLOUD="True" if you run qdrant from the web
9
+ USE_QDRANT_CLOUD="False"
10
+ # Change QDRANT_CLOUD_URL and fill out QDRANT_APIKEY if you run qdrant from the web
11
+ # https://qdrant.tech/documentation/cloud/create-cluster/
12
+ QDRANT_CLOUD_URL="host.docker.internal:6333"
13
+ QDRANT_APIKEY=your_qdrant_apikey
14
+
15
+
16
+ # Ollama
17
+ # USE_DOCKER="True" if you setup the ollama docker container(docker compose)
18
+ # USE_DOCKER="False" if you run ollama serve from command line
19
+ USE_DOCKER="True"
20
+
21
+ # ClearML(optional for running files in ClearML folder)
22
+ # https://clear.ml/docs/latest/docs/clearml_serving/clearml_serving_setup/
23
+ CLEARML_WEB_HOST=your_clearml_web_host(link)
24
+ CLEARML_API_HOST=your_clearml_api_host(link)
25
+ CLEARML_FILES_HOST=your_clearml_files_host(link)
26
+ CLEARML_API_ACCESS_KEY=your_clearml_api_access_key(str)
27
+ CLEARML_API_SECRET_KEY=your_clearml_api_secret_key(str)
28
+
29
+ # --- OpenAI is not used for this project(optional), but a function to pull the model is given. ---
30
+
31
+ # OpenAI API Config(unused)
32
+ # https://platform.openai.com/api-keys
33
+ OPENAI_MODEL_ID=gpt-4o-mini
34
+ OPENAI_API_KEY=your_openai_api_key
35
+
36
+ # Huggingface API Config(unused)
37
+ # https://huggingface.co/docs/hub/en/security-tokens
38
+ HUGGINGFACE_ACCESS_TOKEN=your_huggingface_access_token
project/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ .gradio
2
+ .env
project/ClearML/DataCollectionPipeline.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# See README for more info on how the DataCollectionPipeline works
# The ETL pipeline is part of the DataCollectionPipeline
# Remove the time.sleep(1) line if you are sure you won't get blocked from a webpage for requesting too often
import os
import shutil
import subprocess
import sys
import tempfile
import time

import pymongo
import requests
from bs4 import BeautifulSoup
from clearml import PipelineDecorator
from dotenv import load_dotenv

# Setup ClearML credentials from .env; the fallback path handles running the
# script from a working directory other than the project root.
try:
    load_dotenv(override=True)
except Exception:
    load_dotenv(sys.path[1] + "/.env", override=True)
CLEARML_WEB_HOST = os.getenv("CLEARML_WEB_HOST")
CLEARML_API_HOST = os.getenv("CLEARML_API_HOST")
CLEARML_FILES_HOST = os.getenv("CLEARML_FILES_HOST")
CLEARML_API_ACCESS_KEY = os.getenv("CLEARML_API_ACCESS_KEY")
# Fix: the variable is named CLEARML_API_SECRET_KEY in .env.example; the
# previous lookup used the misspelled "CLEARML_API_SECRETKEY" and always
# returned None.
CLEARML_API_SECRET_KEY = os.getenv("CLEARML_API_SECRET_KEY")

# Input into the Data Collection Pipeline is a list of links to domains.
# NOTE: the second assignment intentionally narrows the full seed list down to
# a small subset for a quick demo run.
links = [
    "https://www.ros.org/",
    "https://docs.nav2.org/",
    "https://moveit.ai/",
    "https://gazebosim.org/home",
    "https://github.com/ros2/ros2",
    "https://github.com/ros-navigation/navigation2",
    "https://github.com/moveit/moveit2",
    "https://github.com/gazebosim/gazebo-classic",
]
links = ["https://www.ros.org/", "https://github.com/ros2/ros2"]
40
+
41
+
42
# ETL pipeline
@PipelineDecorator.component(cache=False, return_values=["documents", "codes"])
def ETL_Pipeline(links):
    """Extract text/code from the given links, transform it, and load it into MongoDB.

    Args:
        links: seed URLs; GitHub repos are cloned, other sites are scraped and
            their outbound links appended to this list for further crawling.

    Returns:
        (documents, codes): the newly scraped entries; each is a dict with a
        "link", a "type" ("Document" or "Github"), and "content" (text).
    """
    # Create a mongoDB connection to check for duplicates before inserting
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    DATABASE_HOST = os.getenv("DATABASE_HOST")
    mongoHost = pymongo.MongoClient(DATABASE_HOST)
    mongoDatabase = mongoHost["twin"]

    # Extract data from links and their subdirectories (using crawlers)
    documents = []
    codes = []
    for link in links:
        # Web scraper/crawler for github links
        if "https://github.com" in link:
            # Do not revisit a link already in the database
            mongoCollection = mongoDatabase["Github"]
            if mongoCollection.find_one({"link": link}) is None:
                # Modified GithubCrawler from LLM-Engineer for scraping github
                local_temp = tempfile.mkdtemp()
                try:
                    os.chdir(local_temp)
                    subprocess.run(["git", "clone", link])
                    repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])
                    tree = {}
                    for root, _, files in os.walk(repo_path):
                        rel_dir = root.replace(repo_path, "").lstrip("/")
                        # Skip VCS metadata and binary/lock artifacts
                        if rel_dir.startswith((".git", ".toml", ".lock", ".png")):
                            continue
                        for file in files:
                            if file.endswith((".git", ".toml", ".lock", ".png")):
                                continue
                            file_path = os.path.join(rel_dir, file)
                            # NOTE(review): stripping ALL spaces from file
                            # contents matches the original behavior — confirm
                            # this is intended for code files.
                            with open(
                                os.path.join(root, file), "r", errors="ignore"
                            ) as f:
                                tree[file_path] = f.read().replace(" ", "")
                except Exception:
                    # Typo fix only: "scrapping" -> "scraping"
                    print(f"Error scraping {link}")
                finally:
                    shutil.rmtree(local_temp)
                # Correct the link: recover the canonical repo path (including
                # the default branch) from the rendered GitHub page
                r = requests.get(link)
                soup = BeautifulSoup(r.content, "html.parser")
                # Find the file path to any of the files in the repository
                link_element = soup.find("a", attrs={"class": "Link--primary"})
                path = link_element.get("href")
                path = path.rsplit("/", 1)[0]
                # Push all the subdirectories to mongo
                for subdirectory, text in tree.items():
                    # Transform the data: normalize whitespace, then collapse
                    # runs of spaces (O(n log n) instead of the original
                    # quadratic character-by-character loop; same result)
                    text = text.replace("\t", " ").replace("\n", " ")
                    while "  " in text:
                        text = text.replace("  ", " ")
                    codes.append(
                        {
                            "link": "https://github.com" + path + "/" + subdirectory,
                            "type": "Github",
                            "content": text,
                        }
                    )
        # Web scraper/crawler for other links (Documents)
        else:
            # Do not revisit a link already in the database
            mongoCollection = mongoDatabase["Document"]
            if mongoCollection.find_one({"link": link}) is None:
                # Get all text in the website
                r = requests.get(link)
                soup = BeautifulSoup(r.content, "html.parser")
                soup.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6"])
                text = soup.get_text()
                # Transform the data: collapse whitespace as above
                text = text.replace("\t", " ").replace("\n", " ")
                while "  " in text:
                    text = text.replace("  ", " ")
                documents.append({"link": link, "type": "Document", "content": text})
                # Also crawl through all subdirectories in the link (related links)
                subdirectories = [a.get("href") for a in soup.find_all("a")]
                for subdirectory in subdirectories:
                    # Fix: only queue links NOT already in the database. The
                    # original tested `is not None`, which re-queued already
                    # visited links and never followed new ones.
                    if (
                        subdirectory is not None
                        and mongoCollection.find_one({"link": link + subdirectory})
                        is None
                    ):
                        links.append(link + subdirectory)
        # Avoid spamming sites
        time.sleep(1)
    # Each document has a link, type (github or other), and content (text).
    # Fix: insert_many raises InvalidOperation on an empty list, so guard both
    # inserts for the "nothing new scraped" case.
    if documents:
        mongoDatabase["Document"].insert_many(documents)
    if codes:
        mongoDatabase["Github"].insert_many(codes)
    return documents, codes
159
+
160
+
161
# Allow ClearML to monitor and run the ETL pipeline
@PipelineDecorator.pipeline(
    name="Data Collection Pipeline",
    project="RAG LLM",
    version="0.2",
)
def main():
    # Run the ETL step over the module-level seed link list
    return ETL_Pipeline(links)


if __name__ == "__main__":
    # Execute pipeline components in-process instead of on a ClearML agent
    PipelineDecorator.run_locally()
    main()
project/ClearML/FeaturePipeline.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# See README for more info on how the FeaturePipeline works
# The Ingestion pipeline is part of the FeaturePipeline
# Make sure to ollama serve before running!
import os
import sys

import pymongo
from clearml import PipelineDecorator
from dotenv import load_dotenv
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, PointStruct, VectorParams

# Setup ClearML credentials from .env; the fallback path handles running the
# script from a working directory other than the project root.
try:
    load_dotenv(override=True)
except Exception:
    load_dotenv(sys.path[1] + "/.env", override=True)
CLEARML_WEB_HOST = os.getenv("CLEARML_WEB_HOST")
CLEARML_API_HOST = os.getenv("CLEARML_API_HOST")
CLEARML_FILES_HOST = os.getenv("CLEARML_FILES_HOST")
CLEARML_API_ACCESS_KEY = os.getenv("CLEARML_API_ACCESS_KEY")
# Fix: the variable is named CLEARML_API_SECRET_KEY in .env.example; the
# misspelled "CLEARML_API_SECRETKEY" lookup always returned None.
CLEARML_API_SECRET_KEY = os.getenv("CLEARML_API_SECRET_KEY")
25
+
26
+
27
@PipelineDecorator.component(
    cache=False, return_values=["links", "resultTypes", "texts"]
)
def retreiveDocuments():
    """Load every stored document from MongoDB.

    Returns:
        Three parallel lists: source links, collection types ("Document" or
        "Github"), and raw text content.

    Note: the name keeps the original spelling ("retreive") because it is the
    component's public identifier. Fix: return_values was a single malformed
    string ["links, resultTypes, texts"] instead of three names.
    """
    links = []
    resultTypes = []
    texts = []
    # Create a mongoDB connection
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    DATABASE_HOST = os.getenv("DATABASE_HOST")
    mongoHost = pymongo.MongoClient(DATABASE_HOST)
    mongoDatabase = mongoHost["twin"]
    for collection in mongoDatabase.list_collection_names():
        for result in mongoDatabase[collection].find():
            links.append(result["link"])
            resultTypes.append(result["type"])
            texts.append(result["content"])
    return links, resultTypes, texts
50
+
51
+
52
@PipelineDecorator.component(cache=False, return_values=["cleanTexts"])
def cleanDocuments(texts):
    """Strip every character outside printable ASCII (codepoints 32..126)."""
    # " " is chr(32) and "~" is chr(126), so the range test matches the
    # original 32 <= ord(char) <= 126 check exactly.
    return ["".join(ch for ch in text if " " <= ch <= "~") for text in texts]
58
+
59
+
60
@PipelineDecorator.component(cache=False, return_values=["chunks", "chunkNums"])
def chunkDocuments(texts):
    """Split each document into ~500-character chunks with 20-char overlap.

    Returns:
        chunks: one entry per chunk (see NOTE below about its shape).
        chunkNums: the chunk's index within its source document; a 0 marks
            the first chunk of a new document (storeEmbeddings relies on this).
    """
    chunks = []
    chunkNums = []
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20,
        length_function=len,
        is_separator_regex=False,
    )
    for text in texts:
        textChunks = text_splitter.split_text(text)
        chunkNum = 0
        for chunk in textChunks:
            # NOTE(review): this re-splits an already-split chunk, so each
            # entry of `chunks` is a *list* of strings rather than a string.
            # Downstream code (reranking's texts[index][0]) depends on that
            # shape — confirm before "fixing" to chunks.append(chunk).
            chunks.append(text_splitter.split_text(chunk))
            chunkNums.append(chunkNum)
            chunkNum += 1
    return chunks, chunkNums
78
+
79
+
80
@PipelineDecorator.component(cache=False, return_values=["embeddings"])
def embedChunks(chunks):
    """Embed every chunk with the local Ollama llama3.2 model."""
    MODEL = "llama3.2"
    # Load .env, falling back to an explicit path when cwd is wrong
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    # Inside the dev container Ollama is reached via host.docker.internal
    if os.getenv("USE_DOCKER") == "True":
        embeddingsModel = OllamaEmbeddings(
            model=MODEL, base_url="http://host.docker.internal:11434"
        )
    else:
        embeddingsModel = OllamaEmbeddings(model=MODEL)
    return [embeddingsModel.embed_query(chunk) for chunk in chunks]
97
+
98
+
99
@PipelineDecorator.component(cache=False)
def storeEmbeddings(embeddings, links, resultTypes, chunks, chunkNums):
    """Upsert each chunk embedding plus metadata into Qdrant.

    Args:
        embeddings: one vector per chunk (parallel to chunks/chunkNums).
        links, resultTypes: one entry per source document.
        chunks, chunkNums: per-chunk payload text and within-document index.
    """
    # Create a qdrant connection
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    USE_QDRANT_CLOUD = os.getenv("USE_QDRANT_CLOUD")
    QDRANT_CLOUD_URL = os.getenv("QDRANT_CLOUD_URL")
    QDRANT_APIKEY = os.getenv("QDRANT_APIKEY")
    # Fix: env vars are strings, so `if USE_QDRANT_CLOUD:` was truthy even for
    # "False". Compare against "True", matching InferencePipeline.py.
    if USE_QDRANT_CLOUD == "True":
        qClient = QdrantClient(url=QDRANT_CLOUD_URL, api_key=QDRANT_APIKEY)
    else:
        qClient = QdrantClient(url=QDRANT_CLOUD_URL)

    # Create qdrant collections to store the 3072-dimensional embeddings
    for collection_name in ("Github", "Document"):
        if not qClient.collection_exists(collection_name):
            qClient.create_collection(
                collection_name=collection_name,
                vectors_config=VectorParams(size=3072, distance=Distance.COSINE),
            )
    chunkIndex = -1
    documentIndex = -1
    for chunkNum in chunkNums:
        chunkIndex += 1
        # chunkNum resets to 0 at the first chunk of each new source document
        if chunkNum == 0:
            documentIndex += 1
        # Store the embedding along with its metadata in the collection that
        # matches the source document's type
        qClient.upsert(
            collection_name=resultTypes[documentIndex],
            wait=True,
            points=[
                PointStruct(
                    id=chunkIndex,
                    vector=embeddings[chunkIndex],
                    payload={
                        "link": links[documentIndex],
                        "type": resultTypes[documentIndex],
                        "chunk": chunkNum,
                        "text": chunks[chunkIndex],
                    },
                )
            ],
        )
+
151
+
152
+ # Ingestion Pipeline
153
+ @PipelineDecorator.pipeline(
154
+ name="Feature Pipeline",
155
+ project="RAG LLM",
156
+ version="0.2",
157
+ )
158
+ def main():
159
+ links, resultTypes, texts = retreiveDocuments()
160
+ texts = cleanDocuments(texts)
161
+ chunks, chunkNums = chunkDocuments(texts)
162
+ embeddings = embedChunks(chunks)
163
+ storeEmbeddings(embeddings, links, resultTypes, chunks, chunkNums)
164
+
165
+
166
+ if __name__ == "__main__":
167
+ PipelineDecorator.run_locally()
168
+ main()
project/ClearML/InferencePipeline.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# See README for more info on how the InferencePipeline works
# The retrieval pipeline is part of the InferencePipeline
import os
import sys
from operator import itemgetter

from clearml import PipelineDecorator
from dotenv import load_dotenv
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from qdrant_client import QdrantClient

# Setup ClearML credentials from .env; the fallback path handles running the
# script from a working directory other than the project root.
try:
    load_dotenv(override=True)
except Exception:
    load_dotenv(sys.path[1] + "/.env", override=True)
CLEARML_WEB_HOST = os.getenv("CLEARML_WEB_HOST")
CLEARML_API_HOST = os.getenv("CLEARML_API_HOST")
CLEARML_FILES_HOST = os.getenv("CLEARML_FILES_HOST")
CLEARML_API_ACCESS_KEY = os.getenv("CLEARML_API_ACCESS_KEY")
# Fix: the variable is named CLEARML_API_SECRET_KEY in .env.example; the
# misspelled "CLEARML_API_SECRETKEY" lookup always returned None. (The header
# comment previously referenced the DataCollectionPipeline — copy/paste.)
CLEARML_API_SECRET_KEY = os.getenv("CLEARML_API_SECRET_KEY")
24
+
25
+
26
# Query expansion (only one additional prompt is generated, for simplicity)
@PipelineDecorator.component(cache=False, return_values=["newQuery"])
def queryExpansion(query):
    """Ask the LLM to restate `query` from a different perspective."""
    MODEL = "llama3.2"
    # Load .env, falling back to an explicit path when cwd is wrong
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    # Inside the dev container Ollama is reached via host.docker.internal
    if os.getenv("USE_DOCKER") == "True":
        model = Ollama(model=MODEL, base_url="http://host.docker.internal:11434")
    else:
        model = Ollama(model=MODEL)

    template = """
    Rewrite the prompt. The new prompt must offer a different perspective.
    Do not change the meaning. Output only the rewritten prompt with no introduction.
    Prompt: {prompt}
    """
    rewriter = {"prompt": itemgetter("prompt")} | PromptTemplate.from_template(template) | model
    return rewriter.invoke({"prompt": query})
49
+
50
+
51
# Self-querying (the generated metadata decides whether to search the Qdrant
# collection that holds github code)
@PipelineDecorator.component(cache=False, return_values=["codingQuestion"])
def selfQuerying(query):
    """Ask the LLM whether `query` needs a code answer ("1") or not ("0")."""
    MODEL = "llama3.2"
    # Load .env, falling back to an explicit path when cwd is wrong
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    # Inside the dev container Ollama is reached via host.docker.internal
    if os.getenv("USE_DOCKER") == "True":
        model = Ollama(model=MODEL, base_url="http://host.docker.internal:11434")
    else:
        model = Ollama(model=MODEL)

    template = """
    You are an AI assistant. You must determine if the prompt requires code as the answer.
    Output a 1 if it is or a 0 if it is not and nothing else.
    Prompt: {prompt}
    """
    classifier = {"prompt": itemgetter("prompt")} | PromptTemplate.from_template(template) | model
    return classifier.invoke({"prompt": query})
74
+
75
+
76
# Filtered vector search for each of the N=2 queries after expansion
@PipelineDecorator.component(cache=False, return_values=["results1", "results2"])
def filteredVectorSearch(query, newQuery, codingQuestion):
    """Embed both queries and search the matching Qdrant collection for each.

    Args:
        query: the user's original question.
        newQuery: the expanded rewrite from queryExpansion.
        codingQuestion: "1" to search the Github collection, else Document.

    Returns:
        (results1, results2): top-10 hits for the original and expanded query.
        Fix: return_values was a single malformed string ["results1, results2"].
    """
    # Load .env once for both the Qdrant and Ollama settings (the original
    # loaded it twice back to back)
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    # Create a qdrant connection
    USE_QDRANT_CLOUD = os.getenv("USE_QDRANT_CLOUD")
    QDRANT_CLOUD_URL = os.getenv("QDRANT_CLOUD_URL")
    QDRANT_APIKEY = os.getenv("QDRANT_APIKEY")
    if USE_QDRANT_CLOUD == "True":
        qClient = QdrantClient(url=QDRANT_CLOUD_URL, api_key=QDRANT_APIKEY)
    else:
        qClient = QdrantClient(url=QDRANT_CLOUD_URL)

    # Setup the text embedder
    MODEL = "llama3.2"
    if os.getenv("USE_DOCKER") == "True":
        embeddingsModel = OllamaEmbeddings(
            model=MODEL, base_url="http://host.docker.internal:11434"
        )
    else:
        embeddingsModel = OllamaEmbeddings(model=MODEL)

    # Search the related collection
    relatedCollection = "Github" if codingQuestion == "1" else "Document"
    results1 = qClient.search(
        collection_name=relatedCollection,
        query_vector=embeddingsModel.embed_query(query),
        limit=10,
    )
    results2 = qClient.search(
        collection_name=relatedCollection,
        query_vector=embeddingsModel.embed_query(newQuery),
        limit=10,
    )
    return results1, results2
119
+
120
+
121
# Collecting results
@PipelineDecorator.component(cache=False, return_values=["results"])
def collectingResults(results1, results2):
    """Merge both search result lists, original-query hits first."""
    combined = [*results1, *results2]
    return combined
125
+
126
+
127
# Reranking (instead of using a CrossEncoder, embeddings are compared manually)
@PipelineDecorator.component(cache=False, return_values=["topTexts"])
def reranking(results):
    """Select the 3 highest-scoring distinct result ids and concatenate text.

    NOTE(review): if fewer than 3 distinct positive-score results exist, the
    scan falls through to index 0 and may append duplicates — confirm intended.
    """
    ids = [result.id for result in results]
    scores = [result.score for result in results]
    topIds = []
    topIndexes = []
    for x in range(3):
        # Linear scan for the best not-yet-selected id
        maxScore = 0
        maxIndex = 0
        for i in range(len(ids)):
            if ids[i] not in topIds and scores[i] > maxScore:
                maxScore = scores[i]
                maxIndex = i
        topIds.append(ids[maxIndex])
        topIndexes.append(maxIndex)
    texts = [result.payload["text"] for result in results]
    topTexts = ""
    for index in topIndexes:
        # NOTE(review): payload["text"] appears to be a list of strings (see
        # chunkDocuments, which re-splits each chunk), so [0] takes its first
        # sub-chunk — confirm the payload shape before changing this.
        topTexts += texts[index][0]
    return topTexts
148
+
149
+
150
# Building prompt
@PipelineDecorator.component(cache=False, return_values=["prompt"])
def buildingPrompt(codingQuestion):
    """Return the prompt template for the question type ("1" = coding)."""
    if codingQuestion == "1":
        template = """
    Write code for the following question given the related coding document below.

    Document: {document}
    Question: {question}
    """
    else:
        template = """
    Answer the question based on the document below. If you can't answer the question, reply "I don't know"

    Document: {document}
    Question: {question}
    """
    # Single construction point instead of one return per branch
    return PromptTemplate.from_template(template)
169
+
170
+
171
# Obtaining answer
@PipelineDecorator.component(cache=False, return_values=["answer"])
def obtainingAnswer(query, prompt, topTexts):
    """Run the RAG chain (context document + question -> LLM) and return the answer.

    Args:
        query: the user's question.
        prompt: PromptTemplate expecting {document} and {question}.
        topTexts: concatenated reranked context text.
    """
    # Setup the model
    MODEL = "llama3.2"
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    USE_DOCKER = os.getenv("USE_DOCKER")
    if USE_DOCKER == "True":
        model = Ollama(model=MODEL, base_url="http://host.docker.internal:11434")
    else:
        model = Ollama(model=MODEL)

    chain = (
        {"document": itemgetter("document"), "question": itemgetter("question")}
        | prompt
        | model
    )
    # Fix: the component declares return_values=["answer"] but the original
    # discarded the chain result and implicitly returned None.
    return chain.invoke({"document": topTexts, "question": query})
192
+
193
+
194
# Inference Pipeline
@PipelineDecorator.pipeline(
    name="Inference Pipeline",
    project="RAG LLM",
    version="0.1",
)
def main():
    # User query
    query = "What operating system was ROS written for?"
    # expand -> route -> search -> merge -> rerank -> build prompt -> answer
    newQuery = queryExpansion(query)
    codingQuestion = selfQuerying(query)
    results1, results2 = filteredVectorSearch(query, newQuery, codingQuestion)
    results = collectingResults(results1, results2)
    topTexts = reranking(results)
    prompt = buildingPrompt(codingQuestion)
    return obtainingAnswer(query, prompt, topTexts)


if __name__ == "__main__":
    # Execute pipeline components in-process instead of on a ClearML agent
    PipelineDecorator.run_locally()
    main()
project/DataCollectionPipeline.ipynb ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "Visiting link: https://github.com/ros2/ros2\n"
13
+ ]
14
+ },
15
+ {
16
+ "name": "stderr",
17
+ "output_type": "stream",
18
+ "text": [
19
+ "Cloning into 'ros2'...\n"
20
+ ]
21
+ },
22
+ {
23
+ "name": "stdout",
24
+ "output_type": "stream",
25
+ "text": [
26
+ "Adding subdirectory: https://github.com/ros2/ros2/tree/rolling/README.md\n",
27
+ "Adding subdirectory: https://github.com/ros2/ros2/tree/rolling/.gitignore\n",
28
+ "Adding subdirectory: https://github.com/ros2/ros2/tree/rolling/CODEOWNERS\n",
29
+ "Adding subdirectory: https://github.com/ros2/ros2/tree/rolling/ros2.repos\n",
30
+ "Adding subdirectory: https://github.com/ros2/ros2/tree/rolling/src/.gitkeep\n"
31
+ ]
32
+ },
33
+ {
34
+ "data": {
35
+ "text/plain": [
36
+ "InsertManyResult([ObjectId('675531b926a728d5b045a2e6'), ObjectId('675531b926a728d5b045a2e7'), ObjectId('675531b926a728d5b045a2e8'), ObjectId('675531b926a728d5b045a2e9'), ObjectId('675531b926a728d5b045a2ea')], acknowledged=True)"
37
+ ]
38
+ },
39
+ "execution_count": 1,
40
+ "metadata": {},
41
+ "output_type": "execute_result"
42
+ }
43
+ ],
44
+ "source": [
45
+ "# See README for more info on how the DataCollectionPipeline works\n",
46
+ "# The ETL pipeline is part of the DataCollectionPipeline\n",
47
+ "# Remove the time.sleep(1) line if you are sure you won't get blocked from a webpage for requesting too often\n",
48
+ "import requests\n",
49
+ "from bs4 import BeautifulSoup\n",
50
+ "import time\n",
51
+ "import os\n",
52
+ "import shutil\n",
53
+ "import subprocess\n",
54
+ "import tempfile\n",
55
+ "from shared import getMongoClient\n",
56
+ "\n",
57
+ "# Input into the Data Collection Pipeline is a list of links to domains\n",
58
+ "links = ['https://www.ros.org/','https://docs.nav2.org/','https://moveit.ai/','https://gazebosim.org/home', 'https://github.com/ros2/ros2', 'https://github.com/ros-navigation/navigation2', 'https://github.com/moveit/moveit2', 'https://github.com/gazebosim/gazebo-classic']\n",
59
+ "links = ['https://www.ros.org/', 'https://github.com/ros2/ros2']\n",
60
+ "\n",
61
+ "# Create a mongoDB connection\n",
62
+ "mongoHost = getMongoClient()\n",
63
+ "mongoDatabase = mongoHost[\"twin\"]\n",
64
+ "\n",
65
+ "# ETL pipeline\n",
66
+ "# Extract data from links and their subdirectories(using crawlers)\n",
67
+ "documents = []\n",
68
+ "codes = []\n",
69
+ "for link in links:\n",
70
+ " # Web scraper/crawler for github links\n",
71
+ " if \"https://github.com\" in link:\n",
72
+ " # Do not revisit a link already in the database\n",
73
+ " mongoCollection = mongoDatabase[\"Github\"]\n",
74
+ " result = mongoCollection.find_one({\"link\": link})\n",
75
+ " if result is None:\n",
76
+ " print(\"Visiting link: \", link)\n",
77
+ " # Modified GithubCrawler from LLM-Engineer for scraping github\n",
78
+ " local_temp = tempfile.mkdtemp()\n",
79
+ " try:\n",
80
+ " os.chdir(local_temp)\n",
81
+ " subprocess.run([\"git\", \"clone\", link])\n",
82
+ " repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])\n",
83
+ " tree = {}\n",
84
+ " for root, _, files in os.walk(repo_path):\n",
85
+ " dir = root.replace(repo_path, \"\").lstrip(\"/\")\n",
86
+ " if dir.startswith((\".git\", \".toml\", \".lock\", \".png\")):\n",
87
+ " continue\n",
88
+ " for file in files:\n",
89
+ " if file.endswith((\".git\", \".toml\", \".lock\", \".png\")):\n",
90
+ " continue\n",
91
+ " file_path = os.path.join(dir, file)\n",
92
+ " with open(\n",
93
+ " os.path.join(root, file), \"r\", errors=\"ignore\"\n",
94
+ " ) as f:\n",
95
+ " tree[file_path] = f.read().replace(\" \", \"\")\n",
96
+ " except Exception:\n",
97
+ " print(f\"Error scrapping {link}\")\n",
98
+ " finally:\n",
99
+ " shutil.rmtree(local_temp)\n",
100
+ " # Correct the link\n",
101
+ " r = requests.get(link)\n",
102
+ " soup = BeautifulSoup(r.content, \"html.parser\")\n",
103
+ " # Find the file path to any of the files in the repository\n",
104
+ " link_element = soup.find(\"a\", attrs={\"class\": \"Link--primary\"})\n",
105
+ " path = link_element.get(\"href\")\n",
106
+ " path = path.rsplit(\"/\", 1)[0]\n",
107
+ " # Push all the subdirectories to mongo\n",
108
+ " for subdirectory in tree:\n",
109
+ " print(\n",
110
+ " f\"Adding subdirectory: https://github.com{path}/{subdirectory}\"\n",
111
+ " )\n",
112
+ " text = tree[subdirectory]\n",
113
+ " # Transform the data\n",
114
+ " # Get rid of repeating \\n characters and spaces\n",
115
+ " text = text.replace(\"\\t\", \" \")\n",
116
+ " text = text.replace(\"\\n\", \" \")\n",
117
+ " text_len = len(text)\n",
118
+ " for i in range(text_len):\n",
119
+ " while i + 1 < text_len and text[i] == \" \" and text[i + 1] == \" \":\n",
120
+ " text = text[:i] + text[i + 1 :]\n",
121
+ " text_len -= 1\n",
122
+ " codes.append(\n",
123
+ " {\n",
124
+ " \"link\": \"https://github.com\"\n",
125
+ " + path\n",
126
+ " + \"/\"\n",
127
+ " + subdirectory,\n",
128
+ " \"type\": \"Github\",\n",
129
+ " \"content\": text,\n",
130
+ " }\n",
131
+ " )\n",
132
+ " else:\n",
133
+ " print(\"Already visited: \", link)\n",
134
+ " # Web scraper/crawler for other links(Documents)\n",
135
+ " else:\n",
136
+ " # Do not revisit a link already in the database\n",
137
+ " mongoCollection = mongoDatabase[\"Document\"]\n",
138
+ " result = mongoCollection.find_one({\"link\": link})\n",
139
+ " if result is None:\n",
140
+ " # Get all text in the website\n",
141
+ " r = requests.get(link)\n",
142
+ " soup = BeautifulSoup(r.content, \"html.parser\")\n",
143
+ " soup.find_all([\"p\", \"h1\", \"h2\", \"h3\", \"h4\", \"h5\", \"h6\"])\n",
144
+ " text = soup.get_text()\n",
145
+ " # Transform the data\n",
146
+ " # Get rid of repeating \\n characters and spaces\n",
147
+ " text = text.replace(\"\\t\", \" \")\n",
148
+ " text = text.replace(\"\\n\", \" \")\n",
149
+ " text_len = len(text)\n",
150
+ " for i in range(text_len):\n",
151
+ " while i + 1 < text_len and text[i] == \" \" and text[i + 1] == \" \":\n",
152
+ " text = text[:i] + text[i + 1 :]\n",
153
+ " text_len -= 1\n",
154
+ " documents.append({\"link\": link, \"type\": \"Document\", \"content\": text})\n",
155
+ " # Also crawl through all subdirectorys in the link(related links)\n",
156
+ " soup = BeautifulSoup(r.content, \"html.parser\")\n",
157
+ " subdirectories = [a.get(\"href\") for a in soup.find_all(\"a\")]\n",
158
+ " for subdirectory in subdirectories:\n",
159
+ " if (\n",
160
+ " subdirectory is not None\n",
161
+ " and mongoCollection.find_one({\"link\": link + subdirectory})\n",
162
+ " is not None\n",
163
+ " ):\n",
164
+ " print(\"Adding subdirectory: \", link + subdirectory)\n",
165
+ " links.append(link + subdirectory)\n",
166
+ " else:\n",
167
+ " print(\"Already visited: \", link)\n",
168
+ " # Avoid spamming sites\n",
169
+ " time.sleep(1)\n",
170
+ "# Each document has a link, type(github or other), and content(text)\n",
171
+ "# You can go to Tools/mongoTools to view the inserted documents\n",
172
+ "mongoCollection = mongoDatabase[\"Document\"]\n",
173
+ "mongoCollection.insert_many(documents)\n",
174
+ "mongoCollection = mongoDatabase[\"Github\"]\n",
175
+ "mongoCollection.insert_many(codes)"
176
+ ]
177
+ }
178
+ ],
179
+ "metadata": {
180
+ "kernelspec": {
181
+ "display_name": "Python 3",
182
+ "language": "python",
183
+ "name": "python3"
184
+ },
185
+ "language_info": {
186
+ "codemirror_mode": {
187
+ "name": "ipython",
188
+ "version": 3
189
+ },
190
+ "file_extension": ".py",
191
+ "mimetype": "text/x-python",
192
+ "name": "python",
193
+ "nbconvert_exporter": "python",
194
+ "pygments_lexer": "ipython3",
195
+ "version": "3.12.7"
196
+ }
197
+ },
198
+ "nbformat": 4,
199
+ "nbformat_minor": 2
200
+ }
project/Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11.9
2
+
3
+ WORKDIR /gradio-app
4
+ COPY . .
5
+ RUN pip install --no-cache-dir python-dotenv==1.0.1 langchain==0.3.10 langchain-community==0.3.10 \
6
+ qdrant-client==1.12.1 gradio==5.8.0 pymongo==4.10.1 langchain-openai==0.2.11
7
+
8
+ EXPOSE 7860
9
+ ENV GRADIO_SERVER_NAME="0.0.0.0"
10
+
11
+ CMD ["python", "app.py"]
project/FeaturePipeline.ipynb ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "/workspaces/RAG_LLM/project/shared.py:57: LangChainDeprecationWarning: The class `OllamaEmbeddings` was deprecated in LangChain 0.3.1 and will be removed in 1.0.0. An updated version of the class exists in the :class:`~langchain-ollama package and should be used instead. To use it run `pip install -U :class:`~langchain-ollama` and import as `from :class:`~langchain_ollama import OllamaEmbeddings``.\n",
13
+ " return OllamaEmbeddings(model=MODEL, base_url=\"http://host.docker.internal:11434\")\n"
14
+ ]
15
+ }
16
+ ],
17
+ "source": [
18
+ "# See README for more info on how the FeaturePipeline works\n",
19
+ "# The Ingestion pipeline is part of the FeaturePipeline\n",
20
+ "# Make sure to ollama serve before running!\n",
21
+ "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
22
+ "from qdrant_client.http.models import Distance, VectorParams, PointStruct\n",
23
+ "from shared import getMongoClient, getQdrantClient, getEmbeddingsModel\n",
24
+ "\n",
25
+ "# Create a mongoDB connection\n",
26
+ "mongoHost = getMongoClient()\n",
27
+ "\n",
28
+ "# Create a qdrant connection\n",
29
+ "qClient = getQdrantClient()\n",
30
+ "\n",
31
+ "# Create qdrant collections to store embeddings\n",
32
+ "if not qClient.collection_exists(\"Github\"):\n",
33
+ " qClient.create_collection(\n",
34
+ " collection_name=\"Github\",\n",
35
+ " vectors_config=VectorParams(size=3072, distance=Distance.COSINE),\n",
36
+ " )\n",
37
+ "if not qClient.collection_exists(\"Document\"):\n",
38
+ " qClient.create_collection(\n",
39
+ " collection_name=\"Document\",\n",
40
+ " vectors_config=VectorParams(size=3072, distance=Distance.COSINE),\n",
41
+ " )\n",
42
+ "\n",
43
+ "# Ingestion Pipeline Setup\n",
44
+ "# Define a text cleaner\n",
45
+ "def cleanText(text):\n",
46
+ " return ''.join(char for char in text if 32 <= ord(char) <= 126)\n",
47
+ "\n",
48
+ "# Setup the text chunker\n",
49
+ "text_splitter = RecursiveCharacterTextSplitter(\n",
50
+ " chunk_size=500,\n",
51
+ " chunk_overlap=20,\n",
52
+ " length_function=len,\n",
53
+ " is_separator_regex=False,\n",
54
+ ")\n",
55
+ "\n",
56
+ "# Setup the text embedder\n",
57
+ "embeddingsModel = getEmbeddingsModel()\n",
58
+ "\n",
59
+ "# Running the ingestion pipeline\n",
60
+ "# Store all documents from each MongoDB collection into qdrant\n",
61
+ "mongoDatabase = mongoHost[\"twin\"]\n",
62
+ "collections = mongoDatabase.list_collection_names()\n",
63
+ "for collection in collections:\n",
64
+ " mongoCollection = mongoDatabase[collection]\n",
65
+ "\n",
66
+ " documents = mongoCollection.find()\n",
67
+ " id = 0\n",
68
+ " for document in documents:\n",
69
+ " # For each document, split it into chunks\n",
70
+ " link = document[\"link\"]\n",
71
+ " resultType = document[\"type\"]\n",
72
+ " text = document[\"content\"]\n",
73
+ " text = cleanText(text)\n",
74
+ " chunks = text_splitter.split_text(text)\n",
75
+ " chunkNum = 0\n",
76
+ " for chunk in chunks:\n",
77
+ " # Create embeddings for each chunk, of length 2048 using the embedding model\n",
78
+ " embedding = embeddingsModel.embed_query(chunk)\n",
79
+ " # Store the embedding along with some metadata into the Qdrant vector database\n",
80
+ " qClient.upsert(collection_name=resultType, wait=True, points=[PointStruct(id=id, vector=embedding, payload={\"link\": link, \"type\": resultType, \"chunk\": chunkNum, \"text\": chunk})])\n",
81
+ " chunkNum += 1\n",
82
+ " id += 1\n"
83
+ ]
84
+ }
85
+ ],
86
+ "metadata": {
87
+ "kernelspec": {
88
+ "display_name": "Python 3",
89
+ "language": "python",
90
+ "name": "python3"
91
+ },
92
+ "language_info": {
93
+ "codemirror_mode": {
94
+ "name": "ipython",
95
+ "version": 3
96
+ },
97
+ "file_extension": ".py",
98
+ "mimetype": "text/x-python",
99
+ "name": "python",
100
+ "nbconvert_exporter": "python",
101
+ "pygments_lexer": "ipython3",
102
+ "version": "3.12.7"
103
+ }
104
+ },
105
+ "nbformat": 4,
106
+ "nbformat_minor": 2
107
+ }
project/InferencePipeline.ipynb ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "/workspaces/RAG_LLM/project/shared.py:57: LangChainDeprecationWarning: The class `OllamaEmbeddings` was deprecated in LangChain 0.3.1 and will be removed in 1.0.0. An updated version of the class exists in the :class:`~langchain-ollama package and should be used instead. To use it run `pip install -U :class:`~langchain-ollama` and import as `from :class:`~langchain_ollama import OllamaEmbeddings``.\n",
13
+ " return OllamaEmbeddings(model=MODEL, base_url=\"http://host.docker.internal:11434\")\n",
14
+ "/workspaces/RAG_LLM/project/shared.py:70: LangChainDeprecationWarning: The class `Ollama` was deprecated in LangChain 0.3.1 and will be removed in 1.0.0. An updated version of the class exists in the :class:`~langchain-ollama package and should be used instead. To use it run `pip install -U :class:`~langchain-ollama` and import as `from :class:`~langchain_ollama import OllamaLLM``.\n",
15
+ " return Ollama(model=MODEL, base_url=\"http://host.docker.internal:11434\")\n"
16
+ ]
17
+ }
18
+ ],
19
+ "source": [
20
+ "# See README for more info on how the DataCollectionPipeline works\n",
21
+ "# The retrieval pipeline is part of the DataCollectionPipeline\n",
22
+ "from shared import getQdrantClient, getEmbeddingsModel, getModel\n",
23
+ "from langchain_community.llms import Ollama\n",
24
+ "from langchain.prompts import PromptTemplate\n",
25
+ "from operator import itemgetter\n",
26
+ "# Create a qdrant connection\n",
27
+ "qClient = getQdrantClient()\n",
28
+ "\n",
29
+ "# Setup the text embedder\n",
30
+ "embeddingsModel = getEmbeddingsModel()\n",
31
+ "\n",
32
+ "# Setup the model\n",
33
+ "model = getModel()\n",
34
+ "\n",
35
+ "# Retrieval Pipeline\n",
36
+ "# Retrieve the chunks with the most similar embeddings from Qdrant\n",
37
+ "def retriever(text, collection):\n",
38
+ " results = qClient.search(\n",
39
+ " collection_name=collection,\n",
40
+ " query_vector = embeddingsModel.embed_query(text),\n",
41
+ " limit=10\n",
42
+ " )\n",
43
+ " return results"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": 5,
49
+ "metadata": {},
50
+ "outputs": [
51
+ {
52
+ "name": "stdout",
53
+ "output_type": "stream",
54
+ "text": [
55
+ "Query expansion: Create a user-friendly, community-driven guide that provides an alternative to the traditional ROS documentation, focusing on real-world scenarios and practical applications rather than technical specifications and developer guides.\n",
56
+ "Coding Question?: 1\n",
57
+ "Related Collection: Github\n",
58
+ "Top texts: #About TheRobotOperatingSystem(ROS)isasetofsoftwarelibrariesandtoolsthathelpyoubuildrobotapplications. Fromdriverstostate-of-the-artalgorithms,andwithpowerfuldevelopertools,ROShaswhatyouneedforyournextroboticsproject. Andit'sallopensource. Fullprojectdetailson[ROS.org](https://ros.org/) #GettingStarted LookingtogetstartedwithROS? Our[installationguideishere](https://www.ros.org/blog/getting-started/).\n",
59
+ "Link: https://github.com/ros2/ros2/tree/rolling/README.md\n",
60
+ "Top texts: type:git url:https://github.com/ros2/tinyxml2_vendor.git version:rolling ros2/tlsf: type:git url:https://github.com/ros2/tlsf.git version:rolling ros2/unique_identifier_msgs: type:git url:https://github.com/ros2/unique_identifier_msgs.git version:rolling ros2/urdf: type:git url:https://github.com/ros2/urdf.git version:rolling ros2/yaml_cpp_vendor: type:git url:https://github.com/ros2/yaml_cpp_vendor.git version:rolling\n",
61
+ "Link: https://github.com/ros2/ros2/tree/rolling/ros2.repos\n",
62
+ "Top texts: *[ROSResourceStatusPage](https://status.openrobotics.org/) *[REP-2000](https://ros.org/reps/rep-2000.html):ROS2ReleasesandTargetPlatforms ##ProjectResources *[PurchaseROSSwag](https://spring.ros.org/) *[InformationabouttheROSTrademark](https://www.ros.org/blog/media/) *OnSocialMedia *[OpenRoboticsonLinkedIn](https://www.linkedin.com/company/open-source-robotics-foundation) *[OpenRoboticsonTwitter](https://twitter.com/OpenRoboticsOrg) *[ROS.orgonTwitter](https://twitter.com/ROSOrg)\n",
63
+ "Link: https://github.com/ros2/ros2/tree/rolling/README.md\n"
64
+ ]
65
+ },
66
+ {
67
+ "data": {
68
+ "text/plain": [
69
+ "\"Here's an example of what the README file for ROS could look like:\\n\\n**Welcome to the Robot Operating System (ROS)**\\n\\nROS is a set of software libraries and tools that help you build robot applications. From driver development to state-of-the-art algorithms, and with powerful development tools, ROS has everything you need for your next robotics project.\\n\\n### Getting Started\\n\\nTo get started with ROS, check out our [installation guide](https://www.ros.org/blog/getting-started/).\\n\\n### What's Included\\n\\nROS includes a range of open-source projects, including:\\n\\n* **tinyxml2_vendor**: A fork of the tinyxml2 library for parsing XML files.\\n* **tlsf**: A library for secure communication over TLS (Transport Layer Security).\\n* **unique_identifier_msgs**: A package for generating unique identifiers for robots and other entities.\\n* **urdf**: A package for working with URDF (Unified Robot Description Format) files.\\n* **yaml_cpp_vendor**: A fork of the yaml-cpp library for parsing YAML files.\\n\\n### ROS Releases and Target Platforms\\n\\nFor more information on ROS releases, target platforms, and release notes, check out [REP-2000](https://ros.org/reps/rep-2000.html).\\n\\n### Project Resources\\n\\n* **ROSSwag**: Purchase ROS-related merchandise from our online store.\\n* **ROS Trademark Information**: Learn about the ROS trademark.\\n\\n### Get Involved\\n\\nStay up-to-date with the latest news and developments in ROS:\\n\\n* Follow us on [LinkedIn](https://www.linkedin.com/company/open-source-robotics-foundation)\\n* Join our Twitter community: [OpenRoboticsOrg](https://twitter.com/OpenRoboticsOrg), [ROSOrg](https://twitter.com/ROSOrg)\\n\\n### License and Contributions\\n\\nROS is an open-source project, licensed under the Apache 2.0 license.\\n\\nWe welcome contributions from the ROS community! 
If you have any ideas or bug fixes to contribute, check out our [contribution guidelines](https://ros.org/blog/contribute/).\\n\\n**Thank You**\\n\\nThanks for choosing ROS as your platform for robotics development!\\n\\nYou can modify this README file according to your needs and preferences.\""
70
+ ]
71
+ },
72
+ "execution_count": 5,
73
+ "metadata": {},
74
+ "output_type": "execute_result"
75
+ }
76
+ ],
77
+ "source": [
78
+ "# User query\n",
79
+ "query = \"Can you create a README file for ROS\"\n",
80
+ "\n",
81
+ "# Query expansion(I only generate one additional prompt for simplicity)\n",
82
+ "template = \"\"\"\n",
83
+ "Rewrite the prompt. The new prompt must offer a different perspective.\n",
84
+ "Do not change the meaning. Output only the rewritten prompt with no introduction.\n",
85
+ " Prompt: {prompt}\n",
86
+ "\"\"\"\n",
87
+ "prompt = PromptTemplate.from_template(template)\n",
88
+ "chain = {\"prompt\": itemgetter(\"prompt\")} | prompt | model\n",
89
+ "queryExpansion = chain.invoke({\"prompt\": query})\n",
90
+ "print(\"Query expansion: \", queryExpansion)\n",
91
+ "\n",
92
+ "# Self-querying(The metadata I will be generating determines whether to look through the Qdrant collection containing github code)\n",
93
+ "template = \"\"\"\n",
94
+ "You are an AI assistant. You must determine if the prompt requires code as the answer.\n",
95
+ "Output a 1 if it is or a 0 if it is not and nothing else.\n",
96
+ " Prompt: {prompt}\n",
97
+ "\"\"\"\n",
98
+ "prompt = PromptTemplate.from_template(template)\n",
99
+ "chain = {\"prompt\": itemgetter(\"prompt\")} | prompt | model\n",
100
+ "codingQuestion = chain.invoke({\"prompt\": query})\n",
101
+ "print(\"Coding Question?: \", codingQuestion)\n",
102
+ "\n",
103
+ "# Filtered vector search for each of the N queries after expansion\n",
104
+ "relatedCollection = 'Document'\n",
105
+ "if (codingQuestion == '1'):\n",
106
+ " relatedCollection = 'Github'\n",
107
+ "print(\"Related Collection: \", relatedCollection)\n",
108
+ "results1 = retriever(query, relatedCollection)\n",
109
+ "results2 = retriever(queryExpansion, relatedCollection)\n",
110
+ "\n",
111
+ "# Collecting results\n",
112
+ "results = results1+results2\n",
113
+ "\n",
114
+ "# Reranking(Instead of using a CrossEncoder, I will manually compare embeddings)\n",
115
+ "ids = [result.id for result in results]\n",
116
+ "scores = [result.score for result in results]\n",
117
+ "topIds = []\n",
118
+ "topIndexes = []\n",
119
+ "for x in range(3):\n",
120
+ " maxScore = 0\n",
121
+ " maxIndex = 0\n",
122
+ " for i in range(len(ids)):\n",
123
+ " if ids[i] not in topIds and scores[i] > maxScore:\n",
124
+ " maxScore = scores[i]\n",
125
+ " maxIndex = i\n",
126
+ " topIds.append(ids[maxIndex])\n",
127
+ " topIndexes.append(maxIndex)\n",
128
+ "texts = [result.payload['text'] for result in results]\n",
129
+ "links = [result.payload['link'] for result in results]\n",
130
+ "topTexts = ''\n",
131
+ "for index in topIndexes:\n",
132
+ " print(\"Top texts: \", texts[index])\n",
133
+ " print(\"Link: \", links[index])\n",
134
+ " topTexts += texts[index]\n",
135
+ "\n",
136
+ "# Building prompt\n",
137
+ "if(codingQuestion == '1'):\n",
138
+ " template = \"\"\"\n",
139
+ " Write code for the following question given the related coding document below.\n",
140
+ "\n",
141
+ " Document: {document}\n",
142
+ " Question: {question}\n",
143
+ " \"\"\"\n",
144
+ " prompt = PromptTemplate.from_template(template)\n",
145
+ "else:\n",
146
+ " template = \"\"\"\n",
147
+ " Answer the question based on the document below. If you can't answer the question, reply \"I don't know\"\n",
148
+ "\n",
149
+ " Document: {document}\n",
150
+ " Question: {question}\n",
151
+ " \"\"\"\n",
152
+ " prompt = PromptTemplate.from_template(template)\n",
153
+ "\n",
154
+ "# Obtaining answer\n",
155
+ "chain = {\"document\": itemgetter(\"document\"), \"question\": itemgetter(\"question\")} | prompt | model\n",
156
+ "chain.invoke({\"document\": topTexts, \"question\": query})"
157
+ ]
158
+ }
159
+ ],
160
+ "metadata": {
161
+ "kernelspec": {
162
+ "display_name": "Python 3",
163
+ "language": "python",
164
+ "name": "python3"
165
+ },
166
+ "language_info": {
167
+ "codemirror_mode": {
168
+ "name": "ipython",
169
+ "version": 3
170
+ },
171
+ "file_extension": ".py",
172
+ "mimetype": "text/x-python",
173
+ "name": "python",
174
+ "nbconvert_exporter": "python",
175
+ "pygments_lexer": "ipython3",
176
+ "version": "3.12.7"
177
+ }
178
+ },
179
+ "nbformat": 4,
180
+ "nbformat_minor": 2
181
+ }
project/README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <h1>Installation:</h1>
2
+ <h3>Docker setup (easy):</h3>
3
+
4
+ * Clone the repository from huggingface
5
+ * Reopen the repository in a dev container
6
+ * Copy the .env.example into a new .env file in the project folder
7
+ * If you want to run files in the ClearML folder, fill out the ClearML env variables, otherwise no changes needed.
8
+ * Open a shell on the host machine(not the dev container) and navigate to the project folder
9
+ * Run "docker compose up -d"
10
+ * Run "docker exec -it ollama ollama pull llama3.2"
11
+ * Select the python 3.12.7 kernels for the notebooks and run DataCollectionPipeline.ipynb and FeaturePipeline.ipynb(to populate the mongodb and qdrant databases)
12
+ * The app is available on localhost:7860
13
+
14
+ <h3>Non-Docker (web-based) setup:</h3>
15
+
16
+ If for some reason the docker setup does not work try connecting to mongodb, qdrant, ollama, and gradio from the web:
17
+ * Clone the repository from huggingface or the entire repository from github
18
+ * Reopen the repository in a dev container
19
+ * Copy the .env.example into a new .env file in the project folder
20
+ * Modify the .env file as instructed in the comments(create accounts for each website)
21
+ * Install ollama in the dev container
22
+ * curl -fsSL https://ollama.com/install.sh | sh
23
+ * Start up ollama
24
+ * ollama serve
25
+ * Download llama3.2(in a new dev container terminal)
26
+ * ollama pull llama3.2
27
+ * Select the python 3.12.7 kernels for the notebooks and run DataCollectionPipeline.ipynb and FeaturePipeline.ipynb(to populate the mongodb and qdrant databases)
28
+ * Run app.py and click on the link
29
+
30
+ <h1>Project infrastructure</h1>
31
+
32
+ Note some files may have similar code to other files, such as the ClearML folder containing the ipynb files rewritten in python in order to work in ClearML, or gradio containing code from InferencePipeline.ipynb. The ipynb files print output to help see what is happening.
33
+
34
+ # app.py
35
+ Sends a query to the inference pipeline to generate an answer. The DataCollectionPipeline.ipynb and FeaturePipeline.ipynb files must be run first to populate the databases.
36
+
37
+ # Data Collection Pipeline
38
+ The Data Collection pipeline takes as input a list of links to domains. The links are fed into the ETL pipeline which Extracts data from the links using a crawler, Transforms the data into a standardized format, and Loads the extracted data into a NoSQL data warehouse, which in this case is MongoDB. The ETL pipeline uses a different method of extracting and transforming based on the link type. In this project, I classify links as either a github repository or document each with their own crawler and cleaner. This raw data is used by the feature pipeline.
39
+
40
+ # Feature Pipeline
41
+ The Feature pipeline contains the ingestion pipeline.
42
+ * The ingestion pipeline extracts documents from MongoDB that were stored by the Data Collection Pipeline. It further cleans the data, breaks it into chunks depending on the data category, passes the chunks through an embedding model to generate embeddings, then loads the embeddings plus their metadata into a vector database, which in this case is Qdrant. The embeddings are passed with additional metadata that contains the document link, type, chunk number, and content.
43
+
44
+ # Training Pipeline
45
+ The training pipeline performs finetuning. I skipped this step since it was not required.
46
+
47
+ # Inference Pipeline
48
+ The inference pipeline contains the retrieval client/pipeline.
49
+ * The retrieval client takes a prompt as input. It uses the same embedding model as the ingestion pipeline in order to create an embedding for the prompt. It then queries the Qdrant database for the 10 closest embeddings using cosine distance and extracts the text chunk stored in the embeddings' metadata. This returns chunks that are related to the prompt.
50
+ * The inference pipeline takes a query as input. It expands the query into N=2 queries using a prompt template, performs self-querying to extract metadata (document type) from the original query, searches the Qdrant for K=10 relevant chunks to each of the N=2 queries plus metadata using the retrieval client, combines the K=10 results from each of the N=2 queries, filters out only the most relevant 3 results, prompts the LLM with the results as context, and generates an answer.
51
+
52
+ # ClearML
53
+ The ClearML folder contains the notebook (.ipynb) pipeline files rewritten to work with ClearML. It is similar code to the notebooks; however, ClearML does not print any output but instead logs all output on the website. The website stores the pipelines, which take input and produce output stored in artifacts. These are the differences between the notebook (.ipynb) pipeline files and the ClearML pipeline files (.py):
54
+ * The ClearML Data Collection Pipeline works the same way, running the entire ETL pipeline in a single step (I could not split the ETL pipeline into 3 steps (Extract, Transform, Load) since my list of links gets bigger while looping through it, because it also goes through some links inside the websites crawled. Breaking it into steps would require more HTTP requests, which would greatly slow down the pipeline).
55
+ * The Feature Pipeline breaks down the notebook's loop (from the ingestion pipeline) into 5 stages: retrieve documents, clean documents, chunk documents, embed chunks, and store embeddings.
56
+ * The Inference Pipeline simply puts each step in the notebook's version into a function. These functions are query expansion, self-querying, filtered vector search, collecting results, reranking, building prompt, and obtaining answer.
57
+
58
+ # Tools
59
+ The tools folder contains code for viewing/deleting what has been stored in MongoDB and Qdrant
60
+
61
+ # shared.py
62
+ shared.py is in both the project folder and project/Tools folder. It contains functions for setting up the connections with either the docker containers or web services. If you are running into errors connecting to any of the services, consider editing this file or double checking the .env file. Note the ClearML folder hardcodes all functions since it had trouble importing code.
project/Tools/QdrantTools.ipynb ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "id=0 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 0, 'text': 'ROS: Home Why ROS? Getting Started Community Ecosystem ROS - Robot Operating System The Robot Operating System (ROS) is a set of software libraries and tools that help you build robot applications. From drivers to state-of-the-art algorithms, and with powerful developer tools, ROS has what you need for your next robotics project. And it\\'s all open source. What is ROS? ROS Videos \" Install Jazzy Jalisco Jazzy Jalisco is our latest ROS 2 LTS release targeted at the Ubuntu 24.04 (Noble) and'} vector=None shard_key=None order_value=None\n",
13
+ "id=1 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 1, 'text': '24.04 (Noble) and Windows 10, though other systems are supported to varying degrees. Learn More Humble Hawksbill ROS 2 Humble Hawksbill is a slighly older LTS release of ROS 2 targeted at Ubuntu 22.04 (Jammy) and Windows 10. Other systems are supported including tier 3 support for 20.04 for those transitioning from ROS 1. Learn More Support There are several mechanisms in place to support the ROS community, each with its own purpose. Documentation Documentation and tutorials for ROS 2 Stack'} vector=None shard_key=None order_value=None\n",
14
+ "id=2 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 2, 'text': 'for ROS 2 Stack Exchange Ask questions. Get answers. Forums Hear the latest discussions ROS 1 Wiki Legacy documentation and tutorials for ROS 1 Documentation Documentation and tutorials for ROS 2 Robotics Stack Exchange Ask questions.Get answers.All ROS versions Forums Hear the latest discussions ROS 1 Wiki Legacy documentation and tutorials for ROS 1 Recent Updates and Highlights ROSCon 2024 Videos are Now Available See the ROSCon 2024 website for details 11/18/2024 - Katherine Scott The'} vector=None shard_key=None order_value=None\n",
15
+ "id=3 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 3, 'text': 'Katherine Scott The videos from ROSCon 2024 in Odense are now available on the ROSCon Website (see the program), this Vimeo showcase, and in the ROS documentation. The ROSCon website also includes the slides from all the talks at ROSCon. I have also included a list of all the videos below. I want to thank AMD for being our 2024 ROSCon video sponsor, their generous support makes the ROSCon live stream and videos possible. READ MORE Recent ROS Discourse Posts ROS News of the Week 11/22/2024 - ROS'} vector=None shard_key=None order_value=None\n",
16
+ "id=4 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 4, 'text': '11/22/2024 - ROS Discourse Gazebo Classic and Citadel End of Life 12/2/2024 - ROS Discourse ROS 2 driver for Mitsubishi Melfa RV-FR 10/24/2024 ROS Discourse Home Why ROS? Getting Started Community Ecosystem Q&A Forum Packages Wiki Documentation media Q&A Forum Packages ROSCon Wiki documentation discord Brought to you by Open Robotics | licensed under Creative Commons Attributions 3.0 | 2021 Open Robotics'} vector=None shard_key=None order_value=None\n",
17
+ "Number of document chunks: 5\n",
18
+ "\n",
19
+ "Sample document chunk(metadata not the vector): \n",
20
+ "id=0 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 0, 'text': 'ROS: Home Why ROS? Getting Started Community Ecosystem ROS - Robot Operating System The Robot Operating System (ROS) is a set of software libraries and tools that help you build robot applications. From drivers to state-of-the-art algorithms, and with powerful developer tools, ROS has what you need for your next robotics project. And it\\'s all open source. What is ROS? ROS Videos \" Install Jazzy Jalisco Jazzy Jalisco is our latest ROS 2 LTS release targeted at the Ubuntu 24.04 (Noble) and'} vector=None shard_key=None order_value=None \n",
21
+ "\n",
22
+ "id=0 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 0, 'text': \"#About TheRobotOperatingSystem(ROS)isasetofsoftwarelibrariesandtoolsthathelpyoubuildrobotapplications. Fromdriverstostate-of-the-artalgorithms,andwithpowerfuldevelopertools,ROShaswhatyouneedforyournextroboticsproject. Andit'sallopensource. Fullprojectdetailson[ROS.org](https://ros.org/) #GettingStarted LookingtogetstartedwithROS? Our[installationguideishere](https://www.ros.org/blog/getting-started/).\"} vector=None shard_key=None order_value=None\n",
23
+ "id=1 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 1, 'text': \"Onceyou'veinstalledROSstartbylearningsome[basicconcepts](https://docs.ros.org/en/rolling/Concepts/Basic.html)andtakealookatour[beginnertutorials](https://docs.ros.org/en/rolling/Tutorials/Beginner-CLI-Tools.html). #JointheROSCommunity ##CommunityResources *[ROSDiscussionForum](https://discourse.ros.org/) *[ROSDiscordServer](https://discord.com/servers/open-robotics-1077825543698927656) *[RoboticsStackExchange](https://robotics.stackexchange.com/)(preferredROSsupportforum).\"} vector=None shard_key=None order_value=None\n",
24
+ "id=2 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 2, 'text': '*[OfficialROSVideos](https://vimeo.com/osrfoundation) *[ROSCon](https://roscon.ros.org),ouryearlydeveloperconference. *CiteROS2inacademicworkusing[DOI:10.1126/scirobotics.abm6074](https://www.science.org/doi/10.1126/scirobotics.abm6074) ##DeveloperResources *[ROS2Documentation](https://docs.ros.org/) *[ROSPackageAPIreference](https://docs.ros.org/en/rolling/p/) *[ROSPackageIndex](https://index.ros.org/) *[ROSonDockerHub](https://hub.docker.com/_/ros/)'} vector=None shard_key=None order_value=None\n",
25
+ "id=3 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 3, 'text': '*[ROSResourceStatusPage](https://status.openrobotics.org/) *[REP-2000](https://ros.org/reps/rep-2000.html):ROS2ReleasesandTargetPlatforms ##ProjectResources *[PurchaseROSSwag](https://spring.ros.org/) *[InformationabouttheROSTrademark](https://www.ros.org/blog/media/) *OnSocialMedia *[OpenRoboticsonLinkedIn](https://www.linkedin.com/company/open-source-robotics-foundation) *[OpenRoboticsonTwitter](https://twitter.com/OpenRoboticsOrg) *[ROS.orgonTwitter](https://twitter.com/ROSOrg)'} vector=None shard_key=None order_value=None\n",
26
+ "id=4 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 4, 'text': 'ROSismadepossiblethroughthegeneroussupportofopensourcecontributorsandthenon-profit[OpenSourceRoboticsFoundation(OSRF)](https://www.openrobotics.org/). TaxdeductibledonationstotheOSRFcanbe[madehere.](https://donorbox.org/support-open-robotics?utm_medium=qrcode&utm_source=qrcode)'} vector=None shard_key=None order_value=None\n",
27
+ "id=5 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/.gitignore', 'type': 'Github', 'chunk': 0, 'text': '#Ignoredefaultnamesforcolconcreatedfolders build install log #Ignoreeverythinginsrcexcepta.gitkeepfile src/* !src/.gitkeep'} vector=None shard_key=None order_value=None\n",
28
+ "id=6 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/CODEOWNERS', 'type': 'Github', 'chunk': 0, 'text': '#Thisfilewasgeneratedbyhttps://github.com/audrow/update-ros2-repos *@clalancette@codebot'} vector=None shard_key=None order_value=None\n",
29
+ "id=7 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/ros2.repos', 'type': 'Github', 'chunk': 0, 'text': 'repositories: ament/ament_cmake: type:git url:https://github.com/ament/ament_cmake.git version:rolling ament/ament_index: type:git url:https://github.com/ament/ament_index.git version:rolling ament/ament_lint: type:git url:https://github.com/ament/ament_lint.git version:rolling ament/ament_package: type:git url:https://github.com/ament/ament_package.git version:rolling ament/google_benchmark_vendor: type:git url:https://github.com/ament/google_benchmark_vendor.git version:rolling'} vector=None shard_key=None order_value=None\n",
30
+ "id=8 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/ros2.repos', 'type': 'Github', 'chunk': 1, 'text': 'version:rolling ament/googletest: type:git url:https://github.com/ament/googletest.git version:rolling ament/uncrustify_vendor: type:git url:https://github.com/ament/uncrustify_vendor.git version:rolling eProsima/Fast-CDR: type:git url:https://github.com/eProsima/Fast-CDR.git version:2.2.x eProsima/Fast-DDS: type:git url:https://github.com/eProsima/Fast-DDS.git version:2.14.x eProsima/foonathan_memory_vendor: type:git url:https://github.com/eProsima/foonathan_memory_vendor.git version:master'} vector=None shard_key=None order_value=None\n",
31
+ "id=9 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/ros2.repos', 'type': 'Github', 'chunk': 2, 'text': 'version:master eclipse-cyclonedds/cyclonedds: type:git url:https://github.com/eclipse-cyclonedds/cyclonedds.git version:releases/0.10.x eclipse-iceoryx/iceoryx: type:git url:https://github.com/eclipse-iceoryx/iceoryx.git version:release_2.0 gazebo-release/gz_cmake_vendor: type:git url:https://github.com/gazebo-release/gz_cmake_vendor.git version:rolling gazebo-release/gz_math_vendor: type:git url:https://github.com/gazebo-release/gz_math_vendor.git version:rolling'} vector=None shard_key=None order_value=None\n",
32
+ "\n",
33
+ "Number of Github chunks: 10\n",
34
+ "\n",
35
+ "Sample Github chunk(metadata not the vector): \n",
36
+ "id=0 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 0, 'text': \"#About TheRobotOperatingSystem(ROS)isasetofsoftwarelibrariesandtoolsthathelpyoubuildrobotapplications. Fromdriverstostate-of-the-artalgorithms,andwithpowerfuldevelopertools,ROShaswhatyouneedforyournextroboticsproject. Andit'sallopensource. Fullprojectdetailson[ROS.org](https://ros.org/) #GettingStarted LookingtogetstartedwithROS? Our[installationguideishere](https://www.ros.org/blog/getting-started/).\"} vector=None shard_key=None order_value=None \n",
37
+ "\n"
38
+ ]
39
+ },
40
+ {
41
+ "name": "stderr",
42
+ "output_type": "stream",
43
+ "text": [
44
+ "/workspaces/RAG_LLM/project/Tools/shared.py:57: LangChainDeprecationWarning: The class `OllamaEmbeddings` was deprecated in LangChain 0.3.1 and will be removed in 1.0.0. An updated version of the class exists in the :class:`~langchain-ollama package and should be used instead. To use it run `pip install -U :class:`~langchain-ollama` and import as `from :class:`~langchain_ollama import OllamaEmbeddings``.\n",
45
+ " return OllamaEmbeddings(model=MODEL, base_url=\"http://host.docker.internal:11434\")\n"
46
+ ]
47
+ },
48
+ {
49
+ "name": "stdout",
50
+ "output_type": "stream",
51
+ "text": [
52
+ "\n",
53
+ "Sample search result(n=2): \n",
54
+ "id=4 version=4 score=0.38799083 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 4, 'text': '11/22/2024 - ROS Discourse Gazebo Classic and Citadel End of Life 12/2/2024 - ROS Discourse ROS 2 driver for Mitsubishi Melfa RV-FR 10/24/2024 ROS Discourse Home Why ROS? Getting Started Community Ecosystem Q&A Forum Packages Wiki Documentation media Q&A Forum Packages ROSCon Wiki documentation discord Brought to you by Open Robotics | licensed under Creative Commons Attributions 3.0 | 2021 Open Robotics'} vector=None shard_key=None order_value=None\n",
55
+ "id=2 version=2 score=0.35047314 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 2, 'text': 'for ROS 2 Stack Exchange Ask questions. Get answers. Forums Hear the latest discussions ROS 1 Wiki Legacy documentation and tutorials for ROS 1 Documentation Documentation and tutorials for ROS 2 Robotics Stack Exchange Ask questions.Get answers.All ROS versions Forums Hear the latest discussions ROS 1 Wiki Legacy documentation and tutorials for ROS 1 Recent Updates and Highlights ROSCon 2024 Videos are Now Available See the ROSCon 2024 website for details 11/18/2024 - Katherine Scott The'} vector=None shard_key=None order_value=None\n"
56
+ ]
57
+ }
58
+ ],
59
+ "source": [
60
+ "from shared import getQdrantClient, getEmbeddingsModel\n",
61
+ "qClient = getQdrantClient()\n",
62
+ "\n",
63
+ "# Show everything in the Document collection\n",
64
+ "numDocumentChunks = 0\n",
65
+ "# Note with_vectors defaults to false, so the vectors are not returned\n",
66
+ "chunks = qClient.scroll(collection_name='Document')\n",
67
+ "#print(chunks)\n",
68
+ "for chunk in chunks[0]:\n",
69
+ " # Only display chunks if vector database is small\n",
70
+ " print(chunk)\n",
71
+ " if numDocumentChunks == 0:\n",
72
+ " sampleDocumentChunk = chunk\n",
73
+ " numDocumentChunks += 1\n",
74
+ "print(\"Number of document chunks: \", numDocumentChunks)\n",
75
+ "if numDocumentChunks > 0:\n",
76
+ " print(\"\\nSample document chunk(metadata not the vector): \")\n",
77
+ " print(sampleDocumentChunk, '\\n')\n",
78
+ "\n",
79
+ "# Show everything in the Github collection\n",
80
+ "numGithubChunks = 0\n",
81
+ "chunks = qClient.scroll(collection_name='Github')\n",
82
+ "#print(chunks)\n",
83
+ "for chunk in chunks[0]:\n",
84
+ " # Only display chunks if vector database is small\n",
85
+ " print(chunk)\n",
86
+ " if numGithubChunks == 0:\n",
87
+ " sampleGithubChunk = chunk\n",
88
+ " numGithubChunks += 1\n",
89
+ "print(\"\\nNumber of Github chunks: \", numGithubChunks)\n",
90
+ "if numGithubChunks > 0:\n",
91
+ " print(\"\\nSample Github chunk(metadata not the vector): \")\n",
92
+ " print(sampleGithubChunk, '\\n')\n",
93
+ "\n",
94
+ "# Show a sample search\n",
95
+ "embeddingsModel = getEmbeddingsModel()\n",
96
+ "results = qClient.search(\n",
97
+ " collection_name=\"Document\",\n",
98
+ " query_vector = embeddingsModel.embed_query(\"What operating system is ROS made for?\"),\n",
99
+ " limit=2\n",
100
+ ")\n",
101
+ "print(\"\\nSample search result(n=2): \")\n",
102
+ "for result in results:\n",
103
+ " print(result)"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": 12,
109
+ "metadata": {},
110
+ "outputs": [
111
+ {
112
+ "name": "stdout",
113
+ "output_type": "stream",
114
+ "text": [
115
+ "Cosine Similarity for related sentences: 0.7035977848391597\n",
116
+ "Cosine Similarity for unrelated sentences: 0.3566534327076298\n"
117
+ ]
118
+ }
119
+ ],
120
+ "source": [
121
+ "import numpy as np\n",
122
+ "# How cosine distance works\n",
123
+ "\n",
124
+ "embedding1 = embeddingsModel.embed_query(\"What is the weather like?\")\n",
125
+ "embedding2 = embeddingsModel.embed_query(\"It is raining today.\")\n",
126
+ "embedding3 = embeddingsModel.embed_query(\"ROS is an open source platform\")\n",
127
+ "def cosine_similarity(vec1, vec2):\n",
128
+ " dot_product = np.dot(vec1, vec2)\n",
129
+ " norm_vec1 = np.linalg.norm(vec1)\n",
130
+ " norm_vec2 = np.linalg.norm(vec2)\n",
131
+ " return dot_product / (norm_vec1 * norm_vec2)\n",
132
+ "similarity1 = cosine_similarity(embedding1, embedding2)\n",
133
+ "similarity2 = cosine_similarity(embedding1, embedding3)\n",
134
+ "print(\"Cosine Similarity for related sentences:\", similarity1)\n",
135
+ "print(\"Cosine Similarity for unrelated sentences:\", similarity2)"
136
+ ]
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": 20,
141
+ "metadata": {},
142
+ "outputs": [
143
+ {
144
+ "data": {
145
+ "text/plain": [
146
+ "True"
147
+ ]
148
+ },
149
+ "execution_count": 20,
150
+ "metadata": {},
151
+ "output_type": "execute_result"
152
+ }
153
+ ],
154
+ "source": [
155
+ "from qdrant_client.http.models import Distance, VectorParams\n",
156
+ "# Delete all collections and vectors inside them\n",
157
+ "qClient.delete_collection(collection_name = \"Document\")\n",
158
+ "qClient.delete_collection(collection_name = \"Github\")\n",
159
+ "# Recreate the empty collections\n",
160
+ "qClient.create_collection(\n",
161
+ " collection_name = \"Document\",\n",
162
+ " vectors_config=VectorParams(size=3072, distance=Distance.COSINE)\n",
163
+ ")\n",
164
+ "qClient.create_collection(\n",
165
+ " collection_name = \"Github\",\n",
166
+ " vectors_config=VectorParams(size=3072, distance=Distance.COSINE)\n",
167
+ ")"
168
+ ]
169
+ }
170
+ ],
171
+ "metadata": {
172
+ "kernelspec": {
173
+ "display_name": "Python 3",
174
+ "language": "python",
175
+ "name": "python3"
176
+ },
177
+ "language_info": {
178
+ "codemirror_mode": {
179
+ "name": "ipython",
180
+ "version": 3
181
+ },
182
+ "file_extension": ".py",
183
+ "mimetype": "text/x-python",
184
+ "name": "python",
185
+ "nbconvert_exporter": "python",
186
+ "pygments_lexer": "ipython3",
187
+ "version": "3.12.7"
188
+ }
189
+ },
190
+ "nbformat": 4,
191
+ "nbformat_minor": 2
192
+ }
project/Tools/__pycache__/shared.cpython-312.pyc ADDED
Binary file (4.07 kB). View file
 
project/Tools/mongoTools.ipynb ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "Number of regular documents: 1\n",
13
+ "Number of github documents: 5\n",
14
+ "Links crawled: ['https://www.ros.org/', 'https://github.com/ros2/ros2/tree/rolling/README.md', 'https://github.com/ros2/ros2/tree/rolling/.gitignore', 'https://github.com/ros2/ros2/tree/rolling/CODEOWNERS', 'https://github.com/ros2/ros2/tree/rolling/ros2.repos', 'https://github.com/ros2/ros2/tree/rolling/src/.gitkeep']\n",
15
+ "Sample regular document: {'_id': ObjectId('675531b926a728d5b045a2e5'), 'link': 'https://www.ros.org/', 'type': 'Document', 'content': ' ROS: Home Why ROS? Getting Started Community Ecosystem ROS - Robot Operating System The Robot Operating System (ROS) is a set of software libraries and tools that help you build robot applications. From drivers to state-of-the-art algorithms, and with powerful developer tools, ROS has what you need for your next robotics project. And it\\'s all open source. What is ROS? ROS Videos \" Install Jazzy Jalisco Jazzy Jalisco is our latest ROS 2 LTS release targeted at the Ubuntu 24.04 (Noble) and Windows 10, though other systems are supported to varying degrees. Learn More Humble Hawksbill ROS 2 Humble Hawksbill is a slighly older LTS release of ROS 2 targeted at Ubuntu 22.04 (Jammy) and Windows 10. Other systems are supported including tier 3 support for 20.04 for those transitioning from ROS 1. Learn More Support There are several mechanisms in place to support the ROS community, each with its own purpose. Documentation Documentation and tutorials for ROS 2 Stack Exchange Ask questions. Get answers. Forums Hear the latest discussions ROS 1 Wiki Legacy documentation and tutorials for ROS 1 Documentation Documentation and tutorials for ROS 2 Robotics Stack Exchange Ask questions.Get answers.All ROS versions Forums Hear the latest discussions ROS 1 Wiki Legacy documentation and tutorials for ROS 1 Recent Updates and Highlights ROSCon 2024 Videos are Now Available See the ROSCon 2024 website for details 11/18/2024 - Katherine Scott The videos from ROSCon 2024 in Odense are now available on the ROSCon Website (see the program), this Vimeo showcase, and in the ROS documentation. The ROSCon website also includes the slides from all the talks at ROSCon. I have also included a list of all the videos below. I want to thank AMD for being our 2024 ROSCon video sponsor, their generous support makes the ROSCon live stream and videos possible. 
READ MORE Recent ROS Discourse Posts ROS News of the Week 11/22/2024 - ROS Discourse Gazebo Classic and Citadel End of Life 12/2/2024 - ROS Discourse ROS 2 driver for Mitsubishi Melfa RV-FR 10/24/2024 ROS Discourse Home Why ROS? Getting Started Community Ecosystem Q&A Forum Packages Wiki Documentation media Q&A Forum Packages ROSCon Wiki documentation discord Brought to you by Open Robotics | licensed under Creative Commons Attributions 3.0 | ©2021 Open Robotics '}\n",
16
+ "Sample github document {'_id': ObjectId('675531b926a728d5b045a2e6'), 'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'content': \"#About TheRobotOperatingSystem(ROS)isasetofsoftwarelibrariesandtoolsthathelpyoubuildrobotapplications. Fromdriverstostate-of-the-artalgorithms,andwithpowerfuldevelopertools,ROShaswhatyouneedforyournextroboticsproject. Andit'sallopensource. Fullprojectdetailson[ROS.org](https://ros.org/) #GettingStarted LookingtogetstartedwithROS? Our[installationguideishere](https://www.ros.org/blog/getting-started/). Onceyou'veinstalledROSstartbylearningsome[basicconcepts](https://docs.ros.org/en/rolling/Concepts/Basic.html)andtakealookatour[beginnertutorials](https://docs.ros.org/en/rolling/Tutorials/Beginner-CLI-Tools.html). #JointheROSCommunity ##CommunityResources *[ROSDiscussionForum](https://discourse.ros.org/) *[ROSDiscordServer](https://discord.com/servers/open-robotics-1077825543698927656) *[RoboticsStackExchange](https://robotics.stackexchange.com/)(preferredROSsupportforum). *[OfficialROSVideos](https://vimeo.com/osrfoundation) *[ROSCon](https://roscon.ros.org),ouryearlydeveloperconference. 
*CiteROS2inacademicworkusing[DOI:10.1126/scirobotics.abm6074](https://www.science.org/doi/10.1126/scirobotics.abm6074) ##DeveloperResources *[ROS2Documentation](https://docs.ros.org/) *[ROSPackageAPIreference](https://docs.ros.org/en/rolling/p/) *[ROSPackageIndex](https://index.ros.org/) *[ROSonDockerHub](https://hub.docker.com/_/ros/) *[ROSResourceStatusPage](https://status.openrobotics.org/) *[REP-2000](https://ros.org/reps/rep-2000.html):ROS2ReleasesandTargetPlatforms ##ProjectResources *[PurchaseROSSwag](https://spring.ros.org/) *[InformationabouttheROSTrademark](https://www.ros.org/blog/media/) *OnSocialMedia *[OpenRoboticsonLinkedIn](https://www.linkedin.com/company/open-source-robotics-foundation) *[OpenRoboticsonTwitter](https://twitter.com/OpenRoboticsOrg) *[ROS.orgonTwitter](https://twitter.com/ROSOrg) ROSismadepossiblethroughthegeneroussupportofopensourcecontributorsandthenon-profit[OpenSourceRoboticsFoundation(OSRF)](https://www.openrobotics.org/). TaxdeductibledonationstotheOSRFcanbe[madehere.](https://donorbox.org/support-open-robotics?utm_medium=qrcode&utm_source=qrcode) \"}\n"
17
+ ]
18
+ }
19
+ ],
20
+ "source": [
21
+ "# Shows the state of the mongo database\n",
22
+ "from shared import getMongoClient\n",
23
+ "mongoHost = getMongoClient()\n",
24
+ "mongoDatabase = mongoHost[\"twin\"]\n",
25
+ "mongoDocumentCollection = mongoDatabase[\"Document\"]\n",
26
+ "mongoGithubCollection = mongoDatabase[\"Github\"]\n",
27
+ "documents = mongoDocumentCollection.find()\n",
28
+ "codes = mongoGithubCollection.find()\n",
29
+ "numDocuments = 0\n",
30
+ "numCodes = 0\n",
31
+ "links = []\n",
32
+ "for document in documents:\n",
33
+ " links.append(document[\"link\"])\n",
34
+ " if numDocuments == 0:\n",
35
+ " sampleDocument = document\n",
36
+ " numDocuments += 1\n",
37
+ "for code in codes:\n",
38
+ " links.append(code[\"link\"])\n",
39
+ " if numCodes == 0:\n",
40
+ " sampleCode = code\n",
41
+ " numCodes += 1\n",
42
+ "print(\"Number of regular documents: \", numDocuments)\n",
43
+ "print(\"Number of github documents: \", numCodes)\n",
44
+ "print(\"Links crawled: \", links)\n",
45
+ "if (numDocuments > 0):\n",
46
+ " print(\"Sample regular document: \", sampleDocument)\n",
47
+ "else:\n",
48
+ " print(\"No documents\")\n",
49
+ "if (numCodes > 0):\n",
50
+ " print(\"Sample github document\", sampleCode)\n",
51
+ "else:\n",
52
+ " print(\"No github documents\")"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": 6,
58
+ "metadata": {},
59
+ "outputs": [
60
+ {
61
+ "data": {
62
+ "text/plain": [
63
+ "DeleteResult({'n': 18, 'electionId': ObjectId('7fffffff000000000000016a'), 'opTime': {'ts': Timestamp(1733585625, 33), 't': 362}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1733585625, 33), 'signature': {'hash': b'\\xd0\\xa2\\xaf\\x1c?p\\xc5\\xd7\\x9a\\x1e\\x1f\\x15\\x1ews\\xdc\\xab)\\xf2B', 'keyId': 7395232362797203469}}, 'operationTime': Timestamp(1733585625, 33)}, acknowledged=True)"
64
+ ]
65
+ },
66
+ "execution_count": 6,
67
+ "metadata": {},
68
+ "output_type": "execute_result"
69
+ }
70
+ ],
71
+ "source": [
72
+ "# Delete all files\n",
73
+ "mongoDocumentCollection.delete_many({})\n",
74
+ "mongoGithubCollection.delete_many({})"
75
+ ]
76
+ }
77
+ ],
78
+ "metadata": {
79
+ "kernelspec": {
80
+ "display_name": "Python 3",
81
+ "language": "python",
82
+ "name": "python3"
83
+ },
84
+ "language_info": {
85
+ "codemirror_mode": {
86
+ "name": "ipython",
87
+ "version": 3
88
+ },
89
+ "file_extension": ".py",
90
+ "mimetype": "text/x-python",
91
+ "name": "python",
92
+ "nbconvert_exporter": "python",
93
+ "pygments_lexer": "ipython3",
94
+ "version": "3.12.7"
95
+ }
96
+ },
97
+ "nbformat": 4,
98
+ "nbformat_minor": 2
99
+ }
project/Tools/shared.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file contains shared functions used by multiple files
2
+ import os
3
+ import sys
4
+
5
+ import pymongo
6
+ from dotenv import load_dotenv
7
+ from langchain_community.embeddings import OllamaEmbeddings
8
+ from langchain_openai.chat_models import ChatOpenAI
9
+ from qdrant_client import QdrantClient
10
+ from langchain_community.llms import Ollama
11
+
12
+
13
# Unused since usage limit reached since years ago...rip
def getOpenAiModel():
    """Return a ChatOpenAI handle for gpt-3.5-turbo.

    Loads .env (falling back to sys.path[1]/.env) and reads OPENAI_API_KEY.
    """
    model_name = "gpt-3.5-turbo"
    try:
        load_dotenv(override=True)
    except Exception:
        # Fall back to the project-level .env when the cwd has none.
        load_dotenv(sys.path[1] + "/.env", override=True)
    api_key = os.getenv("OPENAI_API_KEY")
    return ChatOpenAI(openai_api_key=api_key, model=model_name)
22
+
23
+
24
# Create a mongoDB connection
def getMongoClient():
    """Return a MongoClient built from the DATABASE_HOST env variable."""
    try:
        load_dotenv(override=True)
    except Exception:
        # Fall back to the project-level .env when the cwd has none.
        load_dotenv(sys.path[1] + "/.env", override=True)
    return pymongo.MongoClient(os.getenv("DATABASE_HOST"))
32
+
33
+
34
# Create a qdrant connection
def getQdrantClient():
    """Return a QdrantClient for either Qdrant Cloud or a local instance.

    Reads USE_QDRANT_CLOUD, QDRANT_CLOUD_URL, and QDRANT_APIKEY from the
    environment (.env is loaded first, falling back to sys.path[1]/.env).
    """
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    USE_QDRANT_CLOUD = os.getenv("USE_QDRANT_CLOUD")
    QDRANT_CLOUD_URL = os.getenv("QDRANT_CLOUD_URL")
    QDRANT_APIKEY = os.getenv("QDRANT_APIKEY")
    # os.getenv returns a string, so compare against "True" explicitly,
    # matching the USE_DOCKER checks elsewhere in this file. The old
    # truthiness test treated any non-empty value (e.g. "False") as cloud.
    if USE_QDRANT_CLOUD == "True":
        return QdrantClient(url=QDRANT_CLOUD_URL, api_key=QDRANT_APIKEY)
    else:
        # NOTE(review): the non-cloud branch also reads QDRANT_CLOUD_URL;
        # presumably the .env points it at the local container — confirm.
        return QdrantClient(url=QDRANT_CLOUD_URL)
47
+
48
+
49
# Setup the text embedder
def getEmbeddingsModel(MODEL="llama3.2"):
    """Return an OllamaEmbeddings client for MODEL.

    When USE_DOCKER is "True", the client targets the host machine's
    Ollama daemon through the Docker host gateway instead of localhost.
    """
    try:
        load_dotenv(override=True)
    except Exception:
        # Fall back to the project-level .env when the cwd has none.
        load_dotenv(sys.path[1] + "/.env", override=True)
    in_docker = os.getenv("USE_DOCKER") == "True"
    if in_docker:
        # Inside a container, localhost is the container itself.
        return OllamaEmbeddings(model=MODEL, base_url="http://host.docker.internal:11434")
    return OllamaEmbeddings(model=MODEL)
60
+
61
+
62
# Setup the model
def getModel(MODEL="llama3.2"):
    """Return an Ollama LLM client for MODEL.

    When USE_DOCKER is "True", the client targets the host machine's
    Ollama daemon through the Docker host gateway instead of localhost.
    """
    try:
        load_dotenv(override=True)
    except Exception:
        # Fall back to the project-level .env when the cwd has none.
        load_dotenv(sys.path[1] + "/.env", override=True)
    in_docker = os.getenv("USE_DOCKER") == "True"
    if in_docker:
        # Inside a container, localhost is the container itself.
        return Ollama(model=MODEL, base_url="http://host.docker.internal:11434")
    return Ollama(model=MODEL)
73
+
74
+
75
# Setup clearML
def setupClearML():
    """Load the ClearML connection settings from the environment.

    Returns a 5-tuple:
    (web_host, api_host, files_host, api_access_key, api_secret_key).
    """
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    CLEARML_WEB_HOST = os.getenv("CLEARML_WEB_HOST")
    CLEARML_API_HOST = os.getenv("CLEARML_API_HOST")
    CLEARML_FILES_HOST = os.getenv("CLEARML_FILES_HOST")
    CLEARML_API_ACCESS_KEY = os.getenv("CLEARML_API_ACCESS_KEY")
    # The original read "CLEARML_API_SECRETKEY" (no underscore), which looks
    # like a typo. Prefer the conventional name and fall back to the old one
    # so existing .env files keep working.
    CLEARML_API_SECRET_KEY = os.getenv("CLEARML_API_SECRET_KEY") or os.getenv(
        "CLEARML_API_SECRETKEY"
    )
    return (
        CLEARML_WEB_HOST,
        CLEARML_API_HOST,
        CLEARML_FILES_HOST,
        CLEARML_API_ACCESS_KEY,
        CLEARML_API_SECRET_KEY,
    )
project/TrainingPipeline.ipynb ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# Fine-tuning not required"
10
+ ]
11
+ }
12
+ ],
13
+ "metadata": {
14
+ "kernelspec": {
15
+ "display_name": ".venv",
16
+ "language": "python",
17
+ "name": "python3"
18
+ },
19
+ "language_info": {
20
+ "codemirror_mode": {
21
+ "name": "ipython",
22
+ "version": 3
23
+ },
24
+ "file_extension": ".py",
25
+ "mimetype": "text/x-python",
26
+ "name": "python",
27
+ "nbconvert_exporter": "python",
28
+ "pygments_lexer": "ipython3",
29
+ "version": "3.11.9"
30
+ }
31
+ },
32
+ "nbformat": 4,
33
+ "nbformat_minor": 2
34
+ }
project/__pycache__/shared.cpython-312.pyc ADDED
Binary file (4.08 kB). View file
 
project/app.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Make sure you have run "ollama serve"
2
+ # This is the same code as ClearML
3
+ import os
4
+ import sys
5
+ from operator import itemgetter
6
+
7
+ import gradio as gr
8
+ from dotenv import load_dotenv
9
+ from langchain.prompts import PromptTemplate
10
+ from langchain_community.embeddings import OllamaEmbeddings
11
+ from langchain_community.llms import Ollama
12
+ from qdrant_client import QdrantClient
13
+ from shared import getModel, getEmbeddingsModel, getQdrantClient
14
+
15
def answer(samplePrompt, useSample, Query):
    """Answer a user query with RAG over the Qdrant vector store.

    Expands the query into one extra prompt, self-queries to decide whether
    the question needs code (routing to the 'Github' or 'Document'
    collection), retrieves 10 chunks per query, keeps the 3 highest-scoring
    unique chunks, and prompts the LLM with them as context.

    Args:
        samplePrompt: prompt chosen from the sample dropdown.
        useSample: when truthy, use samplePrompt instead of Query.
        Query: free-text prompt typed by the user.

    Returns:
        The LLM's answer string.
    """
    if useSample:
        query = samplePrompt
    else:
        query = Query
    # Create a qdrant connection
    qClient = getQdrantClient()

    # Setup the text embedder
    embeddingsModel = getEmbeddingsModel()

    # Setup the model
    model = getModel()

    # Retrieval Pipeline
    # Retrieve the chunks with the most similar embeddings from Qdrant
    def retriever(text, collection):
        results = qClient.search(
            collection_name=collection,
            query_vector=embeddingsModel.embed_query(text),
            limit=10
        )
        return results

    # Query expansion(I only generate one additional prompt for simplicity)
    template = """
    Rewrite the prompt. The new prompt must offer a different perspective.
    Do not change the meaning. Output only the rewritten prompt with no introduction.
    Prompt: {prompt}
    """
    prompt = PromptTemplate.from_template(template)
    chain = {"prompt": itemgetter("prompt")} | prompt | model
    queryExpansion = chain.invoke({"prompt": query})

    # Self-querying(The metadata I will be generating determines whether to look through the Qdrant collection containing github code)
    template = """
    You are an AI assistant. You must determine if the prompt requires code as the answer.
    Output a 1 if it is or a 0 if it is not and nothing else.
    Prompt: {prompt}
    """
    prompt = PromptTemplate.from_template(template)
    chain = {"prompt": itemgetter("prompt")} | prompt | model
    codingQuestion = chain.invoke({"prompt": query})
    # LLMs routinely append whitespace/newlines; normalize so the '1'
    # comparison below does not silently fail on "1\n".
    codingQuestion = codingQuestion.strip()

    # Filtered vector search for each of the N queries after expansion
    relatedCollection = 'Document'
    if (codingQuestion == '1'):
        relatedCollection = 'Github'
    results1 = retriever(query, relatedCollection)
    results2 = retriever(queryExpansion, relatedCollection)

    # Collecting results
    results = results1 + results2

    # Reranking(Instead of using a CrossEncoder, I will manually compare embeddings)
    # Keep the top 3 unique chunk ids by score. Stop early when no unseen id
    # remains (the original loop re-appended index 0 when fewer than 3 unique
    # ids existed, duplicating the same chunk in the context).
    ids = [result.id for result in results]
    scores = [result.score for result in results]
    topIds = []
    topIndexes = []
    for x in range(min(3, len(results))):
        maxScore = 0
        maxIndex = -1
        for i in range(len(ids)):
            if ids[i] not in topIds and scores[i] > maxScore:
                maxScore = scores[i]
                maxIndex = i
        if maxIndex == -1:
            break
        topIds.append(ids[maxIndex])
        topIndexes.append(maxIndex)
    texts = [result.payload['text'] for result in results]
    links = [result.payload['link'] for result in results]
    topTexts = ''
    for index in topIndexes:
        print("Top texts: ", texts[index])
        print("Link: ", links[index])
        topTexts += texts[index]

    # Building prompt
    if(codingQuestion == '1'):
        template = """
    Write code for the following question given the related coding document below.

    Document: {document}
    Question: {question}
    """
        prompt = PromptTemplate.from_template(template)
    else:
        template = """
    Answer the question based on the document below. If you can't answer the question, reply "I don't know"

    Document: {document}
    Question: {question}
    """
        prompt = PromptTemplate.from_template(template)

    # Obtaining answer
    chain = {"document": itemgetter("document"), "question": itemgetter("question")} | prompt | model
    return chain.invoke({"document": topTexts, "question": query})
112
+
113
+
114
# Gradio UI: pick a sample prompt (with the checkbox ticked) or type a
# custom query into the text box; the answer is shown as plain text.
samplePromptDropdown = gr.Dropdown(
    ["What is ROS?", "Write me code to move a robot"], label="Sample Prompt"
)
demo = gr.Interface(
    fn=answer,
    inputs=[samplePromptDropdown, "checkbox", "text"],
    outputs=["text"],
)

demo.launch(share=False)
project/docker-compose.yml ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
services:
  # LLM server; the app reaches it via host.docker.internal:11434 (see shared.py).
  ollama:
    container_name: ollama
    image: ollama/ollama:latest
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all available NVIDIA GPUs for model inference.
            - driver: nvidia
              capabilities: ["gpu"]
              count: all
    ports:
      - 11434:11434
    expose:
      - 11434
    volumes:
      # Persist downloaded models across container restarts.
      - ollama:/root/.ollama
  # The gradio web UI, built from the local Dockerfile.
  gradio:
    build:
      context: .
      dockerfile: Dockerfile
    ports:
      - 7860:7860
    expose:
      - 7860
    # NOTE(review): gradio is the only service attached to "net"; the other
    # services sit on the default network. The app appears to reach them via
    # host.docker.internal and published host ports instead — confirm this is
    # intentional before relying on service-name DNS between containers.
    networks:
      - net
    environment:
      # Bind on all interfaces so the published port is reachable from the host.
      - GRADIO_SERVER_NAME=0.0.0.0
      - GRADIO_SERVER_PORT=7860
  # Document store (raw scraped documents).
  mongo:
    image: mongo:latest
    ports:
      - 27017:27017
    expose:
      - 27017
    volumes:
      - mongo-data:/data/db
  # Vector database for embeddings / retrieval.
  qdrant:
    image: qdrant/qdrant:latest
    restart: always
    ports:
      - 6333:6333
      - 6334:6334
    expose:
      - 6333
      - 6334
      - 6335
    configs:
      # Inline config (defined below) mounted as the production config file.
      - source: qdrant_config
        target: /qdrant/config/production.yaml
    volumes:
      - qdrant-data:/qdrant/storage

configs:
  qdrant_config:
    content: |
      log_level: INFO

volumes:
  mongo-data:
    driver: local
  qdrant-data:
    driver: local
  ollama:
    driver: local

networks:
  # User-defined bridge network (default driver); currently only gradio joins it.
  net:
project/shared.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file contains shared functions used by multiple files
2
+ import os
3
+ import sys
4
+
5
+ import pymongo
6
+ from dotenv import load_dotenv
7
+ from langchain_community.embeddings import OllamaEmbeddings
8
+ from langchain_openai.chat_models import ChatOpenAI
9
+ from qdrant_client import QdrantClient
10
+ from langchain_community.llms import Ollama
11
+
12
+
13
# Unused since usage limit reached since years ago...rip
def getOpenAiModel():
    """Return a ChatOpenAI client for gpt-3.5-turbo.

    Loads environment variables from a .env file (current directory first,
    then sys.path[1] as a fallback) and reads OPENAI_API_KEY from the
    environment.
    """
    MODEL = "gpt-3.5-turbo"
    # Prefer a .env in the working directory; fall back to the project root.
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    return ChatOpenAI(openai_api_key=os.getenv("OPENAI_API_KEY"), model=MODEL)
22
+
23
+
24
# Create a mongoDB connection
def getMongoClient():
    """Return a pymongo.MongoClient connected to DATABASE_HOST from .env."""
    # Prefer a .env in the working directory; fall back to the project root.
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    host = os.getenv("DATABASE_HOST")
    return pymongo.MongoClient(host)
32
+
33
+
34
# Create a qdrant connection
def getQdrantClient():
    """Return a QdrantClient.

    Uses the cloud instance (URL + API key) when USE_QDRANT_CLOUD == "True";
    otherwise connects to the same URL without an API key.
    NOTE(review): the non-cloud branch also reads QDRANT_CLOUD_URL —
    presumably the .env points it at the local instance in that case; confirm.
    """
    # Prefer a .env in the working directory; fall back to the project root.
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    url = os.getenv("QDRANT_CLOUD_URL")
    if os.getenv("USE_QDRANT_CLOUD") == "True":
        return QdrantClient(url=url, api_key=os.getenv("QDRANT_APIKEY"))
    return QdrantClient(url=url)
47
+
48
+
49
# Setup the text embedder
def getEmbeddingsModel(MODEL="llama3.2"):
    """Return an OllamaEmbeddings instance for MODEL.

    When USE_DOCKER == "True" in the environment, the Ollama server is reached
    through host.docker.internal; otherwise the client default is used.
    """
    # Prefer a .env in the working directory; fall back to the project root.
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    if os.getenv("USE_DOCKER") == "True":
        return OllamaEmbeddings(model=MODEL, base_url="http://host.docker.internal:11434")
    return OllamaEmbeddings(model=MODEL)
60
+
61
+
62
# Setup the model
def getModel(MODEL="llama3.2"):
    """Return an Ollama LLM instance for MODEL.

    When USE_DOCKER == "True" in the environment, the Ollama server is reached
    through host.docker.internal; otherwise the client default is used.
    """
    # Prefer a .env in the working directory; fall back to the project root.
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    if os.getenv("USE_DOCKER") == "True":
        return Ollama(model=MODEL, base_url="http://host.docker.internal:11434")
    return Ollama(model=MODEL)
73
+
74
+
75
# Setup clearML
def setupClearML():
    """Load ClearML connection settings from the environment.

    Returns a 5-tuple:
        (CLEARML_WEB_HOST, CLEARML_API_HOST, CLEARML_FILES_HOST,
         CLEARML_API_ACCESS_KEY, CLEARML_API_SECRET_KEY)

    Any value missing from the environment is returned as None.
    """
    # Prefer a .env in the working directory; fall back to the project root.
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    CLEARML_WEB_HOST = os.getenv("CLEARML_WEB_HOST")
    CLEARML_API_HOST = os.getenv("CLEARML_API_HOST")
    CLEARML_FILES_HOST = os.getenv("CLEARML_FILES_HOST")
    CLEARML_API_ACCESS_KEY = os.getenv("CLEARML_API_ACCESS_KEY")
    # FIX: the original read "CLEARML_API_SECRETKEY" (missing underscore),
    # inconsistent with every other variable here and with ClearML's standard
    # env var name. Read the standard name first, falling back to the old
    # typo'd name so existing .env files keep working.
    CLEARML_API_SECRET_KEY = os.getenv("CLEARML_API_SECRET_KEY") or os.getenv(
        "CLEARML_API_SECRETKEY"
    )
    return (
        CLEARML_WEB_HOST,
        CLEARML_API_HOST,
        CLEARML_FILES_HOST,
        CLEARML_API_ACCESS_KEY,
        CLEARML_API_SECRET_KEY,
    )