Commit
·
2af0eb7
1
Parent(s):
90d383a
Moving HF spaces to HF model
Browse files- .devcontainer/Dockerfile +7 -0
- .devcontainer/devcontainer.json +37 -0
- project/.env.example +38 -0
- project/.gitignore +2 -0
- project/ClearML/DataCollectionPipeline.py +173 -0
- project/ClearML/FeaturePipeline.py +168 -0
- project/ClearML/InferencePipeline.py +214 -0
- project/DataCollectionPipeline.ipynb +200 -0
- project/Dockerfile +11 -0
- project/FeaturePipeline.ipynb +107 -0
- project/InferencePipeline.ipynb +181 -0
- project/README.md +62 -0
- project/Tools/QdrantTools.ipynb +192 -0
- project/Tools/__pycache__/shared.cpython-312.pyc +0 -0
- project/Tools/mongoTools.ipynb +99 -0
- project/Tools/shared.py +92 -0
- project/TrainingPipeline.ipynb +34 -0
- project/__pycache__/shared.cpython-312.pyc +0 -0
- project/app.py +126 -0
- project/docker-compose.yml +72 -0
- project/shared.py +92 -0
.devcontainer/Dockerfile
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM mcr.microsoft.com/devcontainers/python:1-3.12-bullseye
|
2 |
+
|
3 |
+
RUN pip install --no-cache-dir python-dotenv==1.0.1 langchain==0.3.10 langchain-community==0.3.10 \
|
4 |
+
qdrant-client==1.12.1 gradio==5.8.0 pymongo==4.10.1 requests==2.32.3 bs4==0.0.2 ipykernel==6.29.5 ipython==8.27.0 \
|
5 |
+
ipywidgets==8.1.5 jupyter==1.1.1 jupyter-client==8.6.2 jupyter-console==6.6.3 jupyter-core==5.7.2 jupyter-server==2.14.2 \
|
6 |
+
jupyter-events==0.10.0 jupyter-lsp==2.2.5 jupyter-server-terminals==0.5.3 jupyterlab==4.2.5 jupyterlab-pygments==0.3.0 \
|
7 |
+
jupyterlab-quarto==0.3.5 jupyterlab-server==2.27.3 jupyterlab-widgets==3.0.13 langchain-openai==0.2.11 clearml==1.16.4
|
.devcontainer/devcontainer.json
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
|
2 |
+
// README at: https://github.com/devcontainers/templates/tree/main/src/python
|
3 |
+
{
|
4 |
+
"name": "Python 3",
|
5 |
+
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
|
6 |
+
"build": {
|
7 |
+
// Path is relative to the devcontainer.json file.
|
8 |
+
"dockerfile": "Dockerfile"
|
9 |
+
},
|
10 |
+
|
11 |
+
// Features to add to the dev container. More info: https://containers.dev/features.
|
12 |
+
// "features": {},
|
13 |
+
|
14 |
+
// Use 'forwardPorts' to make a list of ports inside the container available locally.
|
15 |
+
// "forwardPorts": [],
|
16 |
+
|
17 |
+
// Use 'postCreateCommand' to run commands after the container is created.
|
18 |
+
// "postCreateCommand": "pip3 install --user -r requirements.txt",
|
19 |
+
|
20 |
+
// Configure tool-specific properties.
|
21 |
+
"customizations": {
|
22 |
+
// Install jupyter, mongodb, and docker
|
23 |
+
"vscode": {
|
24 |
+
"extensions": [
|
25 |
+
"ms-toolsai.jupyter",
|
26 |
+
"mongodb.mongodb-vscode",
|
27 |
+
"ms-azuretools.vscode-docker"
|
28 |
+
],
|
29 |
+
"settings": {
|
30 |
+
|
31 |
+
}
|
32 |
+
}
|
33 |
+
},
|
34 |
+
|
35 |
+
// Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
|
36 |
+
"remoteUser": "root"
|
37 |
+
}
|
project/.env.example
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# MongoDB database
|
2 |
+
# Change DATABASE_HOST if you run mongodb from the web
|
3 |
+
# https://www.mongodb.com/resources/products/fundamentals/mongodb-cluster-setup
|
4 |
+
DATABASE_HOST="mongodb://host.docker.internal:27017"
|
5 |
+
|
6 |
+
# Qdrant vector database
|
7 |
+
# USE_QDRANT_CLOUD="False" if you setup the qdrant docker container(docker compose)
|
8 |
+
# USE_QDRANT_CLOUD="True" if you run qdrant from the web
|
9 |
+
USE_QDRANT_CLOUD="False"
|
10 |
+
# Change QDRANT_CLOUD_URL and fill out QDRANT_APIKEY if you run qdrant from the web
|
11 |
+
# https://qdrant.tech/documentation/cloud/create-cluster/
|
12 |
+
QDRANT_CLOUD_URL="host.docker.internal:6333"
|
13 |
+
QDRANT_APIKEY=your_qdrant_apikey
|
14 |
+
|
15 |
+
|
16 |
+
# Ollama
|
17 |
+
# USE_DOCKER="True" if you setup the ollama docker container(docker compose)
|
18 |
+
# USE_DOCKER="False" if you run ollama serve from command line
|
19 |
+
USE_DOCKER="True"
|
20 |
+
|
21 |
+
# ClearML(optional for running files in ClearML folder)
|
22 |
+
# https://clear.ml/docs/latest/docs/clearml_serving/clearml_serving_setup/
|
23 |
+
CLEARML_WEB_HOST=your_clearml_web_host(link)
|
24 |
+
CLEARML_API_HOST=your_clearml_api_host(link)
|
25 |
+
CLEARML_FILES_HOST=your_clearml_files_host(link)
|
26 |
+
CLEARML_API_ACCESS_KEY=your_clearml_api_access_key(str)
|
27 |
+
CLEARML_API_SECRET_KEY=your_clearml_api_secret_key(str)
|
28 |
+
|
29 |
+
# --- OpenAI is not used for this project(optional), but a function to pull the model is given. ---
|
30 |
+
|
31 |
+
# OpenAI API Config(unused)
|
32 |
+
# https://platform.openai.com/api-keys
|
33 |
+
OPENAI_MODEL_ID=gpt-4o-mini
|
34 |
+
OPENAI_API_KEY=your_openai_api_key
|
35 |
+
|
36 |
+
# Huggingface API Config(unused)
|
37 |
+
# https://huggingface.co/docs/hub/en/security-tokens
|
38 |
+
HUGGINGFACE_ACCESS_TOKEN=your_huggingface_access_token
|
project/.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
.gradio
|
2 |
+
.env
|
project/ClearML/DataCollectionPipeline.py
ADDED
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# See README for more info on how the DataCollectionPipeline works
|
2 |
+
# The ETL pipeline is part of the DataCollectionPipeline
|
3 |
+
# Remove the time.sleep(1) line if you are sure you won't get blocked from a webpage for requesting too often
|
4 |
+
import os
|
5 |
+
import shutil
|
6 |
+
import subprocess
|
7 |
+
import sys
|
8 |
+
import tempfile
|
9 |
+
import time
|
10 |
+
|
11 |
+
import pymongo
|
12 |
+
import requests
|
13 |
+
from bs4 import BeautifulSoup
|
14 |
+
from clearml import PipelineDecorator
|
15 |
+
from dotenv import load_dotenv
|
16 |
+
|
17 |
+
# Setup ClearML — load credentials from .env (fall back to a path-relative .env)
try:
    load_dotenv(override=True)
except Exception:
    load_dotenv(sys.path[1] + "/.env", override=True)
CLEARML_WEB_HOST = os.getenv("CLEARML_WEB_HOST")
CLEARML_API_HOST = os.getenv("CLEARML_API_HOST")
CLEARML_FILES_HOST = os.getenv("CLEARML_FILES_HOST")
CLEARML_API_ACCESS_KEY = os.getenv("CLEARML_API_ACCESS_KEY")
# Fixed: env var name was misspelled "CLEARML_API_SECRETKEY", which always
# yielded None; .env.example defines CLEARML_API_SECRET_KEY.
CLEARML_API_SECRET_KEY = os.getenv("CLEARML_API_SECRET_KEY")
|
27 |
+
|
28 |
+
# Input into the Data Collection Pipeline is a list of links to domains
links = [
    "https://www.ros.org/",
    "https://docs.nav2.org/",
    "https://moveit.ai/",
    "https://gazebosim.org/home",
    "https://github.com/ros2/ros2",
    "https://github.com/ros-navigation/navigation2",
    "https://github.com/moveit/moveit2",
    "https://github.com/gazebosim/gazebo-classic",
]
# NOTE(review): this reassignment silently discards the full list above and
# crawls only a two-link subset — presumably a test/demo shortcut. Delete this
# line to crawl every configured domain.
links = ["https://www.ros.org/", "https://github.com/ros2/ros2"]
|
40 |
+
|
41 |
+
|
42 |
+
# ETL pipeline
# Fixed: return_values was the single string "documents, codes" instead of two names.
@PipelineDecorator.component(cache=False, return_values=["documents", "codes"])
def ETL_Pipeline(links):
    """Extract raw text/code from the given links, clean it, and load it into MongoDB.

    Crawls GitHub links by cloning the repository and reading every non-binary
    file; crawls other links by scraping page text and queuing unseen sub-links.

    Returns:
        documents: list of {"link", "type": "Document", "content"} dicts.
        codes: list of {"link", "type": "Github", "content"} dicts.
    """
    import re  # local import: used to collapse whitespace runs below

    # Create a mongoDB connection to check for duplicates before inserting
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    DATABASE_HOST = os.getenv("DATABASE_HOST")
    mongoHost = pymongo.MongoClient(DATABASE_HOST)
    mongoDatabase = mongoHost["twin"]

    # Extract data from links and their subdirectories (using crawlers)
    documents = []
    codes = []
    for link in links:
        # Web scraper/crawler for github links
        if "https://github.com" in link:
            mongoCollection = mongoDatabase["Github"]
            # Do not revisit a link already in the database
            if mongoCollection.find_one({"link": link}) is None:
                # Modified GithubCrawler from LLM-Engineer for scraping github
                local_temp = tempfile.mkdtemp()
                # Fixed: initialize tree before the try block so a failed clone
                # cannot cause a NameError when tree is read afterwards.
                tree = {}
                try:
                    # NOTE(review): chdir changes the process CWD and never
                    # restores it — confirm no later code depends on the CWD.
                    os.chdir(local_temp)
                    subprocess.run(["git", "clone", link])
                    repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])
                    for root, _, files in os.walk(repo_path):
                        dir = root.replace(repo_path, "").lstrip("/")
                        if dir.startswith((".git", ".toml", ".lock", ".png")):
                            continue
                        for file in files:
                            if file.endswith((".git", ".toml", ".lock", ".png")):
                                continue
                            file_path = os.path.join(dir, file)
                            with open(
                                os.path.join(root, file), "r", errors="ignore"
                            ) as f:
                                tree[file_path] = f.read().replace("    ", "")
                except Exception:
                    print(f"Error scraping {link}")  # fixed typo: "scrapping"
                finally:
                    shutil.rmtree(local_temp)
                # Correct the link: recover the in-repo path prefix from the page
                r = requests.get(link)
                soup = BeautifulSoup(r.content, "html.parser")
                link_element = soup.find("a", attrs={"class": "Link--primary"})
                path = link_element.get("href")
                path = path.rsplit("/", 1)[0]
                # Push all the files to mongo
                for subdirectory, text in tree.items():
                    # Collapse tabs/newlines, then runs of spaces, in one C-level
                    # pass (replaces the original O(n^2) char-by-char loop).
                    text = text.replace("\t", " ").replace("\n", " ")
                    text = re.sub(r" +", " ", text)
                    codes.append(
                        {
                            "link": "https://github.com" + path + "/" + subdirectory,
                            "type": "Github",
                            "content": text,
                        }
                    )
        # Web scraper/crawler for other links (Documents)
        else:
            mongoCollection = mongoDatabase["Document"]
            # Do not revisit a link already in the database
            if mongoCollection.find_one({"link": link}) is None:
                # Get all text in the website
                r = requests.get(link)
                soup = BeautifulSoup(r.content, "html.parser")
                soup.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6"])
                text = soup.get_text()
                text = text.replace("\t", " ").replace("\n", " ")
                text = re.sub(r" +", " ", text)
                documents.append({"link": link, "type": "Document", "content": text})
                # Also crawl through all subdirectories in the link (related links)
                subdirectories = [a.get("href") for a in soup.find_all("a")]
                for subdirectory in subdirectories:
                    # Fixed: the original queued links that were ALREADY stored
                    # ("is not None"), so new pages were never actually crawled.
                    if (
                        subdirectory is not None
                        and mongoCollection.find_one({"link": link + subdirectory})
                        is None
                    ):
                        links.append(link + subdirectory)
            # Avoid spamming sites
            time.sleep(1)
    # Each document has a link, type (github or other), and content (text).
    # Guard the inserts: insert_many raises InvalidOperation on an empty list.
    if documents:
        mongoDatabase["Document"].insert_many(documents)
    if codes:
        mongoDatabase["Github"].insert_many(codes)
    return documents, codes
|
159 |
+
|
160 |
+
|
161 |
+
# Allow ClearML to monitor and run the ETL pipeline
@PipelineDecorator.pipeline(
    name="Data Collection Pipeline",
    project="RAG LLM",
    version="0.2",
)
def main():
    """Entry point: run the ETL component over the configured links."""
    return ETL_Pipeline(links)


if __name__ == "__main__":
    # Execute the pipeline in-process rather than on a remote agent.
    PipelineDecorator.run_locally()
    main()
|
project/ClearML/FeaturePipeline.py
ADDED
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# See README for more info on how the FeaturePipeline works
|
2 |
+
# The Ingestion pipeline is part of the FeaturePipeline
|
3 |
+
# Make sure to ollama serve before running!
|
4 |
+
import os
|
5 |
+
import sys
|
6 |
+
|
7 |
+
import pymongo
|
8 |
+
from clearml import PipelineDecorator
|
9 |
+
from dotenv import load_dotenv
|
10 |
+
from langchain_community.embeddings import OllamaEmbeddings
|
11 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
12 |
+
from qdrant_client import QdrantClient
|
13 |
+
from qdrant_client.http.models import Distance, PointStruct, VectorParams
|
14 |
+
|
15 |
+
# Setup ClearML — load credentials from .env (fall back to a path-relative .env)
try:
    load_dotenv(override=True)
except Exception:
    load_dotenv(sys.path[1] + "/.env", override=True)
CLEARML_WEB_HOST = os.getenv("CLEARML_WEB_HOST")
CLEARML_API_HOST = os.getenv("CLEARML_API_HOST")
CLEARML_FILES_HOST = os.getenv("CLEARML_FILES_HOST")
CLEARML_API_ACCESS_KEY = os.getenv("CLEARML_API_ACCESS_KEY")
# Fixed: env var name was misspelled "CLEARML_API_SECRETKEY", which always
# yielded None; .env.example defines CLEARML_API_SECRET_KEY.
CLEARML_API_SECRET_KEY = os.getenv("CLEARML_API_SECRET_KEY")
|
25 |
+
|
26 |
+
|
27 |
+
# Fixed: return_values was the single string "links, resultTypes, texts"
# instead of three separate names.
@PipelineDecorator.component(cache=False, return_values=["links", "resultTypes", "texts"])
def retreiveDocuments():
    """Load every stored document from MongoDB.

    Returns parallel lists:
        links: source URL of each document.
        resultTypes: "Github" or "Document" per entry.
        texts: raw text content per entry.
    """
    links = []
    resultTypes = []
    texts = []
    # Create a mongoDB connection
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    DATABASE_HOST = os.getenv("DATABASE_HOST")
    mongoHost = pymongo.MongoClient(DATABASE_HOST)
    mongoDatabase = mongoHost["twin"]
    for collection in mongoDatabase.list_collection_names():
        for result in mongoDatabase[collection].find():
            links.append(result["link"])
            resultTypes.append(result["type"])
            texts.append(result["content"])
    return links, resultTypes, texts
|
50 |
+
|
51 |
+
|
52 |
+
@PipelineDecorator.component(cache=False, return_values=["cleanTexts"])
def cleanDocuments(texts):
    """Strip every character outside printable ASCII (codes 32-126) from each text."""
    def _printable(ch):
        return 32 <= ord(ch) <= 126

    return ["".join(filter(_printable, entry)) for entry in texts]
|
58 |
+
|
59 |
+
|
60 |
+
@PipelineDecorator.component(cache=False, return_values=["chunks", "chunkNums"])
def chunkDocuments(texts):
    """Split each text into overlapping ~500-char chunks.

    Returns:
        chunks: flat list of chunk strings across all texts.
        chunkNums: per-chunk 0-based index within its source document
            (resets to 0 at each new document — storeEmbeddings relies on this).
    """
    chunks = []
    chunkNums = []
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20,
        length_function=len,
        is_separator_regex=False,
    )
    for text in texts:
        for chunkNum, chunk in enumerate(text_splitter.split_text(text)):
            # Fixed: the original appended text_splitter.split_text(chunk) —
            # re-splitting an already-split chunk and storing a LIST, which
            # broke embed_query (expects a str) downstream.
            chunks.append(chunk)
            chunkNums.append(chunkNum)
    return chunks, chunkNums
|
78 |
+
|
79 |
+
|
80 |
+
@PipelineDecorator.component(cache=False, return_values=["embeddings"])
def embedChunks(chunks):
    """Embed every chunk with the local Ollama llama3.2 model.

    Returns one embedding vector per chunk, in input order.
    """
    MODEL = "llama3.2"
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    USE_DOCKER = os.getenv("USE_DOCKER")
    # Inside the devcontainer, Ollama is reachable via the host gateway.
    if USE_DOCKER == "True":
        embedder = OllamaEmbeddings(model=MODEL, base_url="http://host.docker.internal:11434")
    else:
        embedder = OllamaEmbeddings(model=MODEL)
    return [embedder.embed_query(chunk) for chunk in chunks]
|
97 |
+
|
98 |
+
|
99 |
+
@PipelineDecorator.component(cache=False)
def storeEmbeddings(embeddings, links, resultTypes, chunks, chunkNums):
    """Upsert chunk embeddings plus metadata into Qdrant.

    Each chunk is written to the collection named by its document's type
    ("Github" or "Document") with payload {link, type, chunk, text}.
    chunkNums must reset to 0 at each new document (see chunkDocuments).
    """
    # Create a qdrant connection
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    USE_QDRANT_CLOUD = os.getenv("USE_QDRANT_CLOUD")
    QDRANT_CLOUD_URL = os.getenv("QDRANT_CLOUD_URL")
    QDRANT_APIKEY = os.getenv("QDRANT_APIKEY")
    # Fixed: USE_QDRANT_CLOUD is the string "True"/"False"; the original bare
    # truthiness test was always True (even for "False"), so the api_key branch
    # always ran. This now matches the check used in InferencePipeline.
    if USE_QDRANT_CLOUD == "True":
        qClient = QdrantClient(url=QDRANT_CLOUD_URL, api_key=QDRANT_APIKEY)
    else:
        qClient = QdrantClient(url=QDRANT_CLOUD_URL)

    # Create qdrant collections to store embeddings
    # (3072 = llama3.2 embedding width; must match embedChunks' model)
    for collection_name in ("Github", "Document"):
        if not qClient.collection_exists(collection_name):
            qClient.create_collection(
                collection_name=collection_name,
                vectors_config=VectorParams(size=3072, distance=Distance.COSINE),
            )
    documentIndex = -1
    for chunkIndex, chunkNum in enumerate(chunkNums):
        # chunkNum == 0 marks the first chunk of the next source document
        if chunkNum == 0:
            documentIndex += 1
        # Store the embedding along with some metadata into Qdrant
        qClient.upsert(
            collection_name=resultTypes[documentIndex],
            wait=True,
            points=[
                PointStruct(
                    id=chunkIndex,
                    vector=embeddings[chunkIndex],
                    payload={
                        "link": links[documentIndex],
                        "type": resultTypes[documentIndex],
                        "chunk": chunkNum,
                        "text": chunks[chunkIndex],
                    },
                )
            ],
        )
|
150 |
+
|
151 |
+
|
152 |
+
# Ingestion Pipeline
@PipelineDecorator.pipeline(
    name="Feature Pipeline",
    project="RAG LLM",
    version="0.2",
)
def main():
    """Retrieve stored documents, then clean -> chunk -> embed -> store."""
    links, resultTypes, texts = retreiveDocuments()
    cleaned = cleanDocuments(texts)
    chunks, chunkNums = chunkDocuments(cleaned)
    embeddings = embedChunks(chunks)
    storeEmbeddings(embeddings, links, resultTypes, chunks, chunkNums)


if __name__ == "__main__":
    # Execute the pipeline in-process rather than on a remote agent.
    PipelineDecorator.run_locally()
    main()
|
project/ClearML/InferencePipeline.py
ADDED
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# See README for more info on how the DataCollectionPipeline works
|
2 |
+
# The retrieval pipeline is part of the DataCollectionPipeline
|
3 |
+
import os
|
4 |
+
import sys
|
5 |
+
from operator import itemgetter
|
6 |
+
|
7 |
+
from clearml import PipelineDecorator
|
8 |
+
from dotenv import load_dotenv
|
9 |
+
from langchain.prompts import PromptTemplate
|
10 |
+
from langchain_community.embeddings import OllamaEmbeddings
|
11 |
+
from langchain_community.llms import Ollama
|
12 |
+
from qdrant_client import QdrantClient
|
13 |
+
|
14 |
+
# Setup ClearML — load credentials from .env (fall back to a path-relative .env)
try:
    load_dotenv(override=True)
except Exception:
    load_dotenv(sys.path[1] + "/.env", override=True)
CLEARML_WEB_HOST = os.getenv("CLEARML_WEB_HOST")
CLEARML_API_HOST = os.getenv("CLEARML_API_HOST")
CLEARML_FILES_HOST = os.getenv("CLEARML_FILES_HOST")
CLEARML_API_ACCESS_KEY = os.getenv("CLEARML_API_ACCESS_KEY")
# Fixed: env var name was misspelled "CLEARML_API_SECRETKEY", which always
# yielded None; .env.example defines CLEARML_API_SECRET_KEY.
CLEARML_API_SECRET_KEY = os.getenv("CLEARML_API_SECRET_KEY")
|
24 |
+
|
25 |
+
|
26 |
+
# Query expansion(I only generate one additional prompt for simplicity)
@PipelineDecorator.component(cache=False, return_values=["newQuery"])
def queryExpansion(query):
    """Ask the LLM for one alternative phrasing of *query* with the same meaning."""
    MODEL = "llama3.2"
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    # Inside the devcontainer, Ollama is reachable via the host gateway.
    if os.getenv("USE_DOCKER") == "True":
        llm = Ollama(model=MODEL, base_url="http://host.docker.internal:11434")
    else:
        llm = Ollama(model=MODEL)

    template = """
    Rewrite the prompt. The new prompt must offer a different perspective.
    Do not change the meaning. Output only the rewritten prompt with no introduction.
    Prompt: {prompt}
    """
    rewrite_chain = (
        {"prompt": itemgetter("prompt")}
        | PromptTemplate.from_template(template)
        | llm
    )
    return rewrite_chain.invoke({"prompt": query})
|
49 |
+
|
50 |
+
|
51 |
+
# Self-querying(The metadata I will be generating determines whether to look through the Qdrant collection containing github code)
@PipelineDecorator.component(cache=False, return_values=["codingQuestion"])
def selfQuerying(query):
    """Classify *query*: returns "1" if it needs code as an answer, else "0"."""
    MODEL = "llama3.2"
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    # Inside the devcontainer, Ollama is reachable via the host gateway.
    if os.getenv("USE_DOCKER") == "True":
        llm = Ollama(model=MODEL, base_url="http://host.docker.internal:11434")
    else:
        llm = Ollama(model=MODEL)

    template = """
    You are an AI assistant. You must determine if the prompt requires code as the answer.
    Output a 1 if it is or a 0 if it is not and nothing else.
    Prompt: {prompt}
    """
    classify_chain = (
        {"prompt": itemgetter("prompt")}
        | PromptTemplate.from_template(template)
        | llm
    )
    return classify_chain.invoke({"prompt": query})
|
74 |
+
|
75 |
+
|
76 |
+
# Filtered vector search for each of the N=2 queries after expansion
# Fixed: return_values was the single string "results1, results2" instead of two names.
@PipelineDecorator.component(cache=False, return_values=["results1", "results2"])
def filteredVectorSearch(query, newQuery, codingQuestion):
    """Search the type-appropriate Qdrant collection for both query variants.

    codingQuestion == "1" routes the search to the "Github" collection,
    anything else to "Document". Returns two lists of up to 10 scored hits.
    """
    # Create a qdrant connection
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    USE_QDRANT_CLOUD = os.getenv("USE_QDRANT_CLOUD")
    QDRANT_CLOUD_URL = os.getenv("QDRANT_CLOUD_URL")
    QDRANT_APIKEY = os.getenv("QDRANT_APIKEY")
    if USE_QDRANT_CLOUD == "True":
        qClient = QdrantClient(url=QDRANT_CLOUD_URL, api_key=QDRANT_APIKEY)
    else:
        qClient = QdrantClient(url=QDRANT_CLOUD_URL)

    # Setup the text embedder (must match the model used at ingestion time)
    MODEL = "llama3.2"
    USE_DOCKER = os.getenv("USE_DOCKER")
    if USE_DOCKER == "True":
        embeddingsModel = OllamaEmbeddings(model=MODEL, base_url="http://host.docker.internal:11434")
    else:
        embeddingsModel = OllamaEmbeddings(model=MODEL)

    # Search the related collection
    relatedCollection = "Github" if codingQuestion == "1" else "Document"
    results1 = qClient.search(
        collection_name=relatedCollection,
        query_vector=embeddingsModel.embed_query(query),
        limit=10,
    )
    results2 = qClient.search(
        collection_name=relatedCollection,
        query_vector=embeddingsModel.embed_query(newQuery),
        limit=10,
    )
    return results1, results2
|
119 |
+
|
120 |
+
|
121 |
+
# Collecting results
@PipelineDecorator.component(cache=False, return_values=["results"])
def collectingResults(results1, results2):
    """Merge both search-result lists into one (order preserved)."""
    return [*results1, *results2]
|
125 |
+
|
126 |
+
|
127 |
+
# Reranking(Instead of using a CrossEncoder, I will manually compare embeddings)
@PipelineDecorator.component(cache=False, return_values=["topTexts"])
def reranking(results):
    """Concatenate the texts of the top-3 highest-scoring unique results.

    Fixed two defects in the original selection loop:
    - maxScore started at 0, so results with non-positive scores (possible with
      cosine similarity) could never be selected;
    - with fewer than 3 unique ids, index 0 was silently re-selected.
    """
    # Indices sorted by score, best first
    order = sorted(range(len(results)), key=lambda i: results[i].score, reverse=True)
    seenIds = set()
    topTexts = ""
    for index in order:
        hit = results[index]
        if hit.id in seenIds:
            continue
        seenIds.add(hit.id)
        text = hit.payload["text"]
        # payload["text"] may be a list (historical double-split ingestion bug)
        # or a plain str — handle both.
        topTexts += text if isinstance(text, str) else text[0]
        if len(seenIds) == 3:
            break
    return topTexts
|
148 |
+
|
149 |
+
|
150 |
+
# Building prompt
@PipelineDecorator.component(cache=False, return_values=["prompt"])
def buildingPrompt(codingQuestion):
    """Return the RAG prompt template for the question type ("1" = coding)."""
    if codingQuestion == "1":
        template = """
    Write code for the following question given the related coding document below.

    Document: {document}
    Question: {question}
    """
    else:
        template = """
    Answer the question based on the document below. If you can't answer the question, reply "I don't know"

    Document: {document}
    Question: {question}
    """
    return PromptTemplate.from_template(template)
|
169 |
+
|
170 |
+
|
171 |
+
# Obtaining answer
@PipelineDecorator.component(cache=False, return_values=["answer"])
def obtainingAnswer(query, prompt, topTexts):
    """Run the RAG chain (context + question -> LLM) and return the answer text."""
    # Setup the model
    MODEL = "llama3.2"
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    USE_DOCKER = os.getenv("USE_DOCKER")
    if USE_DOCKER == "True":
        model = Ollama(model=MODEL, base_url="http://host.docker.internal:11434")
    else:
        model = Ollama(model=MODEL)

    chain = (
        {"document": itemgetter("document"), "question": itemgetter("question")}
        | prompt
        | model
    )
    # Fixed: the original invoked the chain but never returned the result, so the
    # declared return value "answer" (and main()'s return) was always None.
    return chain.invoke({"document": topTexts, "question": query})
|
192 |
+
|
193 |
+
|
194 |
+
# Inference Pipeline
@PipelineDecorator.pipeline(
    name="Inference Pipeline",
    project="RAG LLM",
    version="0.1",
)
def main():
    """Expand -> self-query -> search -> merge -> rerank -> prompt -> answer."""
    query = "What operating system was ROS written for?"  # User query
    newQuery = queryExpansion(query)
    codingQuestion = selfQuerying(query)
    results1, results2 = filteredVectorSearch(query, newQuery, codingQuestion)
    merged = collectingResults(results1, results2)
    topTexts = reranking(merged)
    prompt = buildingPrompt(codingQuestion)
    return obtainingAnswer(query, prompt, topTexts)


if __name__ == "__main__":
    # Execute the pipeline in-process rather than on a remote agent.
    PipelineDecorator.run_locally()
    main()
|
project/DataCollectionPipeline.ipynb
ADDED
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"name": "stdout",
|
10 |
+
"output_type": "stream",
|
11 |
+
"text": [
|
12 |
+
"Visiting link: https://github.com/ros2/ros2\n"
|
13 |
+
]
|
14 |
+
},
|
15 |
+
{
|
16 |
+
"name": "stderr",
|
17 |
+
"output_type": "stream",
|
18 |
+
"text": [
|
19 |
+
"Cloning into 'ros2'...\n"
|
20 |
+
]
|
21 |
+
},
|
22 |
+
{
|
23 |
+
"name": "stdout",
|
24 |
+
"output_type": "stream",
|
25 |
+
"text": [
|
26 |
+
"Adding subdirectory: https://github.com/ros2/ros2/tree/rolling/README.md\n",
|
27 |
+
"Adding subdirectory: https://github.com/ros2/ros2/tree/rolling/.gitignore\n",
|
28 |
+
"Adding subdirectory: https://github.com/ros2/ros2/tree/rolling/CODEOWNERS\n",
|
29 |
+
"Adding subdirectory: https://github.com/ros2/ros2/tree/rolling/ros2.repos\n",
|
30 |
+
"Adding subdirectory: https://github.com/ros2/ros2/tree/rolling/src/.gitkeep\n"
|
31 |
+
]
|
32 |
+
},
|
33 |
+
{
|
34 |
+
"data": {
|
35 |
+
"text/plain": [
|
36 |
+
"InsertManyResult([ObjectId('675531b926a728d5b045a2e6'), ObjectId('675531b926a728d5b045a2e7'), ObjectId('675531b926a728d5b045a2e8'), ObjectId('675531b926a728d5b045a2e9'), ObjectId('675531b926a728d5b045a2ea')], acknowledged=True)"
|
37 |
+
]
|
38 |
+
},
|
39 |
+
"execution_count": 1,
|
40 |
+
"metadata": {},
|
41 |
+
"output_type": "execute_result"
|
42 |
+
}
|
43 |
+
],
|
44 |
+
"source": [
|
45 |
+
"# See README for more info on how the DataCollectionPipeline works\n",
|
46 |
+
"# The ETL pipeline is part of the DataCollectionPipeline\n",
|
47 |
+
"# Remove the time.sleep(1) line if you are sure you won't get blocked from a webpage for requesting too often\n",
|
48 |
+
"import requests\n",
|
49 |
+
"from bs4 import BeautifulSoup\n",
|
50 |
+
"import time\n",
|
51 |
+
"import os\n",
|
52 |
+
"import shutil\n",
|
53 |
+
"import subprocess\n",
|
54 |
+
"import tempfile\n",
|
55 |
+
"from shared import getMongoClient\n",
|
56 |
+
"\n",
|
57 |
+
"# Input into the Data Collection Pipeline is a list of links to domains\n",
|
58 |
+
"links = ['https://www.ros.org/','https://docs.nav2.org/','https://moveit.ai/','https://gazebosim.org/home', 'https://github.com/ros2/ros2', 'https://github.com/ros-navigation/navigation2', 'https://github.com/moveit/moveit2', 'https://github.com/gazebosim/gazebo-classic']\n",
|
59 |
+
"links = ['https://www.ros.org/', 'https://github.com/ros2/ros2']\n",
|
60 |
+
"\n",
|
61 |
+
"# Create a mongoDB connection\n",
|
62 |
+
"mongoHost = getMongoClient()\n",
|
63 |
+
"mongoDatabase = mongoHost[\"twin\"]\n",
|
64 |
+
"\n",
|
65 |
+
"# ETL pipeline\n",
|
66 |
+
"# Extract data from links and their subdirectories(using crawlers)\n",
|
67 |
+
"documents = []\n",
|
68 |
+
"codes = []\n",
|
69 |
+
"for link in links:\n",
|
70 |
+
" # Web scraper/crawler for github links\n",
|
71 |
+
" if \"https://github.com\" in link:\n",
|
72 |
+
" # Do not revisit a link already in the database\n",
|
73 |
+
" mongoCollection = mongoDatabase[\"Github\"]\n",
|
74 |
+
" result = mongoCollection.find_one({\"link\": link})\n",
|
75 |
+
" if result is None:\n",
|
76 |
+
" print(\"Visiting link: \", link)\n",
|
77 |
+
" # Modified GithubCrawler from LLM-Engineer for scraping github\n",
|
78 |
+
" local_temp = tempfile.mkdtemp()\n",
|
79 |
+
" try:\n",
|
80 |
+
" os.chdir(local_temp)\n",
|
81 |
+
" subprocess.run([\"git\", \"clone\", link])\n",
|
82 |
+
" repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])\n",
|
83 |
+
" tree = {}\n",
|
84 |
+
" for root, _, files in os.walk(repo_path):\n",
|
85 |
+
" dir = root.replace(repo_path, \"\").lstrip(\"/\")\n",
|
86 |
+
" if dir.startswith((\".git\", \".toml\", \".lock\", \".png\")):\n",
|
87 |
+
" continue\n",
|
88 |
+
" for file in files:\n",
|
89 |
+
" if file.endswith((\".git\", \".toml\", \".lock\", \".png\")):\n",
|
90 |
+
" continue\n",
|
91 |
+
" file_path = os.path.join(dir, file)\n",
|
92 |
+
" with open(\n",
|
93 |
+
" os.path.join(root, file), \"r\", errors=\"ignore\"\n",
|
94 |
+
" ) as f:\n",
|
95 |
+
" tree[file_path] = f.read().replace(\" \", \"\")\n",
|
96 |
+
" except Exception:\n",
|
97 |
+
" print(f\"Error scrapping {link}\")\n",
|
98 |
+
" finally:\n",
|
99 |
+
" shutil.rmtree(local_temp)\n",
|
100 |
+
" # Correct the link\n",
|
101 |
+
" r = requests.get(link)\n",
|
102 |
+
" soup = BeautifulSoup(r.content, \"html.parser\")\n",
|
103 |
+
" # Find the file path to any of the files in the repository\n",
|
104 |
+
" link_element = soup.find(\"a\", attrs={\"class\": \"Link--primary\"})\n",
|
105 |
+
" path = link_element.get(\"href\")\n",
|
106 |
+
" path = path.rsplit(\"/\", 1)[0]\n",
|
107 |
+
" # Push all the subdirectories to mongo\n",
|
108 |
+
" for subdirectory in tree:\n",
|
109 |
+
" print(\n",
|
110 |
+
" f\"Adding subdirectory: https://github.com{path}/{subdirectory}\"\n",
|
111 |
+
" )\n",
|
112 |
+
" text = tree[subdirectory]\n",
|
113 |
+
" # Transform the data\n",
|
114 |
+
" # Get rid of repeating \\n characters and spaces\n",
|
115 |
+
" text = text.replace(\"\\t\", \" \")\n",
|
116 |
+
" text = text.replace(\"\\n\", \" \")\n",
|
117 |
+
" text_len = len(text)\n",
|
118 |
+
" for i in range(text_len):\n",
|
119 |
+
" while i + 1 < text_len and text[i] == \" \" and text[i + 1] == \" \":\n",
|
120 |
+
" text = text[:i] + text[i + 1 :]\n",
|
121 |
+
" text_len -= 1\n",
|
122 |
+
" codes.append(\n",
|
123 |
+
" {\n",
|
124 |
+
" \"link\": \"https://github.com\"\n",
|
125 |
+
" + path\n",
|
126 |
+
" + \"/\"\n",
|
127 |
+
" + subdirectory,\n",
|
128 |
+
" \"type\": \"Github\",\n",
|
129 |
+
" \"content\": text,\n",
|
130 |
+
" }\n",
|
131 |
+
" )\n",
|
132 |
+
" else:\n",
|
133 |
+
" print(\"Already visited: \", link)\n",
|
134 |
+
" # Web scraper/crawler for other links(Documents)\n",
|
135 |
+
" else:\n",
|
136 |
+
" # Do not revisit a link already in the database\n",
|
137 |
+
" mongoCollection = mongoDatabase[\"Document\"]\n",
|
138 |
+
" result = mongoCollection.find_one({\"link\": link})\n",
|
139 |
+
" if result is None:\n",
|
140 |
+
" # Get all text in the website\n",
|
141 |
+
" r = requests.get(link)\n",
|
142 |
+
" soup = BeautifulSoup(r.content, \"html.parser\")\n",
|
143 |
+
" soup.find_all([\"p\", \"h1\", \"h2\", \"h3\", \"h4\", \"h5\", \"h6\"])\n",
|
144 |
+
" text = soup.get_text()\n",
|
145 |
+
" # Transform the data\n",
|
146 |
+
" # Get rid of repeating \\n characters and spaces\n",
|
147 |
+
" text = text.replace(\"\\t\", \" \")\n",
|
148 |
+
" text = text.replace(\"\\n\", \" \")\n",
|
149 |
+
" text_len = len(text)\n",
|
150 |
+
" for i in range(text_len):\n",
|
151 |
+
" while i + 1 < text_len and text[i] == \" \" and text[i + 1] == \" \":\n",
|
152 |
+
" text = text[:i] + text[i + 1 :]\n",
|
153 |
+
" text_len -= 1\n",
|
154 |
+
" documents.append({\"link\": link, \"type\": \"Document\", \"content\": text})\n",
|
155 |
+
" # Also crawl through all subdirectorys in the link(related links)\n",
|
156 |
+
" soup = BeautifulSoup(r.content, \"html.parser\")\n",
|
157 |
+
" subdirectories = [a.get(\"href\") for a in soup.find_all(\"a\")]\n",
|
158 |
+
" for subdirectory in subdirectories:\n",
|
159 |
+
" if (\n",
|
160 |
+
" subdirectory is not None\n",
|
161 |
+
" and mongoCollection.find_one({\"link\": link + subdirectory})\n",
|
162 |
+
" is not None\n",
|
163 |
+
" ):\n",
|
164 |
+
" print(\"Adding subdirectory: \", link + subdirectory)\n",
|
165 |
+
" links.append(link + subdirectory)\n",
|
166 |
+
" else:\n",
|
167 |
+
" print(\"Already visited: \", link)\n",
|
168 |
+
" # Avoid spamming sites\n",
|
169 |
+
" time.sleep(1)\n",
|
170 |
+
"# Each document has a link, type(github or other), and content(text)\n",
|
171 |
+
"# You can go to Tools/mongoTools to view the inserted documents\n",
|
172 |
+
"mongoCollection = mongoDatabase[\"Document\"]\n",
|
173 |
+
"mongoCollection.insert_many(documents)\n",
|
174 |
+
"mongoCollection = mongoDatabase[\"Github\"]\n",
|
175 |
+
"mongoCollection.insert_many(codes)"
|
176 |
+
]
|
177 |
+
}
|
178 |
+
],
|
179 |
+
"metadata": {
|
180 |
+
"kernelspec": {
|
181 |
+
"display_name": "Python 3",
|
182 |
+
"language": "python",
|
183 |
+
"name": "python3"
|
184 |
+
},
|
185 |
+
"language_info": {
|
186 |
+
"codemirror_mode": {
|
187 |
+
"name": "ipython",
|
188 |
+
"version": 3
|
189 |
+
},
|
190 |
+
"file_extension": ".py",
|
191 |
+
"mimetype": "text/x-python",
|
192 |
+
"name": "python",
|
193 |
+
"nbconvert_exporter": "python",
|
194 |
+
"pygments_lexer": "ipython3",
|
195 |
+
"version": "3.12.7"
|
196 |
+
}
|
197 |
+
},
|
198 |
+
"nbformat": 4,
|
199 |
+
"nbformat_minor": 2
|
200 |
+
}
|
project/Dockerfile
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.11.9
|
2 |
+
|
3 |
+
WORKDIR /gradio-app
|
4 |
+
COPY . .
|
5 |
+
RUN pip install --no-cache-dir python-dotenv==1.0.1 langchain==0.3.10 langchain-community==0.3.10 \
|
6 |
+
qdrant-client==1.12.1 gradio==5.8.0 pymongo==4.10.1 langchain-openai==0.2.11
|
7 |
+
|
8 |
+
EXPOSE 7860
|
9 |
+
ENV GRADIO_SERVER_NAME="0.0.0.0"
|
10 |
+
|
11 |
+
CMD ["python", "app.py"]
|
project/FeaturePipeline.ipynb
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"name": "stderr",
|
10 |
+
"output_type": "stream",
|
11 |
+
"text": [
|
12 |
+
"/workspaces/RAG_LLM/project/shared.py:57: LangChainDeprecationWarning: The class `OllamaEmbeddings` was deprecated in LangChain 0.3.1 and will be removed in 1.0.0. An updated version of the class exists in the :class:`~langchain-ollama package and should be used instead. To use it run `pip install -U :class:`~langchain-ollama` and import as `from :class:`~langchain_ollama import OllamaEmbeddings``.\n",
|
13 |
+
" return OllamaEmbeddings(model=MODEL, base_url=\"http://host.docker.internal:11434\")\n"
|
14 |
+
]
|
15 |
+
}
|
16 |
+
],
|
17 |
+
"source": [
|
18 |
+
"# See README for more info on how the FeaturePipeline works\n",
|
19 |
+
"# The Ingestion pipeline is part of the FeaturePipeline\n",
|
20 |
+
"# Make sure to ollama serve before running!\n",
|
21 |
+
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
|
22 |
+
"from qdrant_client.http.models import Distance, VectorParams, PointStruct\n",
|
23 |
+
"from shared import getMongoClient, getQdrantClient, getEmbeddingsModel\n",
|
24 |
+
"\n",
|
25 |
+
"# Create a mongoDB connection\n",
|
26 |
+
"mongoHost = getMongoClient()\n",
|
27 |
+
"\n",
|
28 |
+
"# Create a qdrant connection\n",
|
29 |
+
"qClient = getQdrantClient()\n",
|
30 |
+
"\n",
|
31 |
+
"# Create qdrant collections to store embeddings\n",
|
32 |
+
"if not qClient.collection_exists(\"Github\"):\n",
|
33 |
+
" qClient.create_collection(\n",
|
34 |
+
" collection_name=\"Github\",\n",
|
35 |
+
" vectors_config=VectorParams(size=3072, distance=Distance.COSINE),\n",
|
36 |
+
" )\n",
|
37 |
+
"if not qClient.collection_exists(\"Document\"):\n",
|
38 |
+
" qClient.create_collection(\n",
|
39 |
+
" collection_name=\"Document\",\n",
|
40 |
+
" vectors_config=VectorParams(size=3072, distance=Distance.COSINE),\n",
|
41 |
+
" )\n",
|
42 |
+
"\n",
|
43 |
+
"# Ingestion Pipeline Setup\n",
|
44 |
+
"# Define a text cleaner\n",
|
45 |
+
"def cleanText(text):\n",
|
46 |
+
" return ''.join(char for char in text if 32 <= ord(char) <= 126)\n",
|
47 |
+
"\n",
|
48 |
+
"# Setup the text chunker\n",
|
49 |
+
"text_splitter = RecursiveCharacterTextSplitter(\n",
|
50 |
+
" chunk_size=500,\n",
|
51 |
+
" chunk_overlap=20,\n",
|
52 |
+
" length_function=len,\n",
|
53 |
+
" is_separator_regex=False,\n",
|
54 |
+
")\n",
|
55 |
+
"\n",
|
56 |
+
"# Setup the text embedder\n",
|
57 |
+
"embeddingsModel = getEmbeddingsModel()\n",
|
58 |
+
"\n",
|
59 |
+
"# Running the ingestion pipeline\n",
|
60 |
+
"# Store all documents from each MongoDB collection into qdrant\n",
|
61 |
+
"mongoDatabase = mongoHost[\"twin\"]\n",
|
62 |
+
"collections = mongoDatabase.list_collection_names()\n",
|
63 |
+
"for collection in collections:\n",
|
64 |
+
" mongoCollection = mongoDatabase[collection]\n",
|
65 |
+
"\n",
|
66 |
+
" documents = mongoCollection.find()\n",
|
67 |
+
" id = 0\n",
|
68 |
+
" for document in documents:\n",
|
69 |
+
" # For each document, split it into chunks\n",
|
70 |
+
" link = document[\"link\"]\n",
|
71 |
+
" resultType = document[\"type\"]\n",
|
72 |
+
" text = document[\"content\"]\n",
|
73 |
+
" text = cleanText(text)\n",
|
74 |
+
" chunks = text_splitter.split_text(text)\n",
|
75 |
+
" chunkNum = 0\n",
|
76 |
+
" for chunk in chunks:\n",
|
77 |
+
" # Create embeddings for each chunk, of length 2048 using the embedding model\n",
|
78 |
+
" embedding = embeddingsModel.embed_query(chunk)\n",
|
79 |
+
" # Store the embedding along with some metadata into the Qdrant vector database\n",
|
80 |
+
" qClient.upsert(collection_name=resultType, wait=True, points=[PointStruct(id=id, vector=embedding, payload={\"link\": link, \"type\": resultType, \"chunk\": chunkNum, \"text\": chunk})])\n",
|
81 |
+
" chunkNum += 1\n",
|
82 |
+
" id += 1\n"
|
83 |
+
]
|
84 |
+
}
|
85 |
+
],
|
86 |
+
"metadata": {
|
87 |
+
"kernelspec": {
|
88 |
+
"display_name": "Python 3",
|
89 |
+
"language": "python",
|
90 |
+
"name": "python3"
|
91 |
+
},
|
92 |
+
"language_info": {
|
93 |
+
"codemirror_mode": {
|
94 |
+
"name": "ipython",
|
95 |
+
"version": 3
|
96 |
+
},
|
97 |
+
"file_extension": ".py",
|
98 |
+
"mimetype": "text/x-python",
|
99 |
+
"name": "python",
|
100 |
+
"nbconvert_exporter": "python",
|
101 |
+
"pygments_lexer": "ipython3",
|
102 |
+
"version": "3.12.7"
|
103 |
+
}
|
104 |
+
},
|
105 |
+
"nbformat": 4,
|
106 |
+
"nbformat_minor": 2
|
107 |
+
}
|
project/InferencePipeline.ipynb
ADDED
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"name": "stderr",
|
10 |
+
"output_type": "stream",
|
11 |
+
"text": [
|
12 |
+
"/workspaces/RAG_LLM/project/shared.py:57: LangChainDeprecationWarning: The class `OllamaEmbeddings` was deprecated in LangChain 0.3.1 and will be removed in 1.0.0. An updated version of the class exists in the :class:`~langchain-ollama package and should be used instead. To use it run `pip install -U :class:`~langchain-ollama` and import as `from :class:`~langchain_ollama import OllamaEmbeddings``.\n",
|
13 |
+
" return OllamaEmbeddings(model=MODEL, base_url=\"http://host.docker.internal:11434\")\n",
|
14 |
+
"/workspaces/RAG_LLM/project/shared.py:70: LangChainDeprecationWarning: The class `Ollama` was deprecated in LangChain 0.3.1 and will be removed in 1.0.0. An updated version of the class exists in the :class:`~langchain-ollama package and should be used instead. To use it run `pip install -U :class:`~langchain-ollama` and import as `from :class:`~langchain_ollama import OllamaLLM``.\n",
|
15 |
+
" return Ollama(model=MODEL, base_url=\"http://host.docker.internal:11434\")\n"
|
16 |
+
]
|
17 |
+
}
|
18 |
+
],
|
19 |
+
"source": [
|
20 |
+
"# See README for more info on how the DataCollectionPipeline works\n",
|
21 |
+
"# The retrieval pipeline is part of the DataCollectionPipeline\n",
|
22 |
+
"from shared import getQdrantClient, getEmbeddingsModel, getModel\n",
|
23 |
+
"from langchain_community.llms import Ollama\n",
|
24 |
+
"from langchain.prompts import PromptTemplate\n",
|
25 |
+
"from operator import itemgetter\n",
|
26 |
+
"# Create a qdrant connection\n",
|
27 |
+
"qClient = getQdrantClient()\n",
|
28 |
+
"\n",
|
29 |
+
"# Setup the text embedder\n",
|
30 |
+
"embeddingsModel = getEmbeddingsModel()\n",
|
31 |
+
"\n",
|
32 |
+
"# Setup the model\n",
|
33 |
+
"model = getModel()\n",
|
34 |
+
"\n",
|
35 |
+
"# Retrieval Pipeline\n",
|
36 |
+
"# Retrieve the chunks with the most similar embeddings from Qdrant\n",
|
37 |
+
"def retriever(text, collection):\n",
|
38 |
+
" results = qClient.search(\n",
|
39 |
+
" collection_name=collection,\n",
|
40 |
+
" query_vector = embeddingsModel.embed_query(text),\n",
|
41 |
+
" limit=10\n",
|
42 |
+
" )\n",
|
43 |
+
" return results"
|
44 |
+
]
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"cell_type": "code",
|
48 |
+
"execution_count": 5,
|
49 |
+
"metadata": {},
|
50 |
+
"outputs": [
|
51 |
+
{
|
52 |
+
"name": "stdout",
|
53 |
+
"output_type": "stream",
|
54 |
+
"text": [
|
55 |
+
"Query expansion: Create a user-friendly, community-driven guide that provides an alternative to the traditional ROS documentation, focusing on real-world scenarios and practical applications rather than technical specifications and developer guides.\n",
|
56 |
+
"Coding Question?: 1\n",
|
57 |
+
"Related Collection: Github\n",
|
58 |
+
"Top texts: #About TheRobotOperatingSystem(ROS)isasetofsoftwarelibrariesandtoolsthathelpyoubuildrobotapplications. Fromdriverstostate-of-the-artalgorithms,andwithpowerfuldevelopertools,ROShaswhatyouneedforyournextroboticsproject. Andit'sallopensource. Fullprojectdetailson[ROS.org](https://ros.org/) #GettingStarted LookingtogetstartedwithROS? Our[installationguideishere](https://www.ros.org/blog/getting-started/).\n",
|
59 |
+
"Link: https://github.com/ros2/ros2/tree/rolling/README.md\n",
|
60 |
+
"Top texts: type:git url:https://github.com/ros2/tinyxml2_vendor.git version:rolling ros2/tlsf: type:git url:https://github.com/ros2/tlsf.git version:rolling ros2/unique_identifier_msgs: type:git url:https://github.com/ros2/unique_identifier_msgs.git version:rolling ros2/urdf: type:git url:https://github.com/ros2/urdf.git version:rolling ros2/yaml_cpp_vendor: type:git url:https://github.com/ros2/yaml_cpp_vendor.git version:rolling\n",
|
61 |
+
"Link: https://github.com/ros2/ros2/tree/rolling/ros2.repos\n",
|
62 |
+
"Top texts: *[ROSResourceStatusPage](https://status.openrobotics.org/) *[REP-2000](https://ros.org/reps/rep-2000.html):ROS2ReleasesandTargetPlatforms ##ProjectResources *[PurchaseROSSwag](https://spring.ros.org/) *[InformationabouttheROSTrademark](https://www.ros.org/blog/media/) *OnSocialMedia *[OpenRoboticsonLinkedIn](https://www.linkedin.com/company/open-source-robotics-foundation) *[OpenRoboticsonTwitter](https://twitter.com/OpenRoboticsOrg) *[ROS.orgonTwitter](https://twitter.com/ROSOrg)\n",
|
63 |
+
"Link: https://github.com/ros2/ros2/tree/rolling/README.md\n"
|
64 |
+
]
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"data": {
|
68 |
+
"text/plain": [
|
69 |
+
"\"Here's an example of what the README file for ROS could look like:\\n\\n**Welcome to the Robot Operating System (ROS)**\\n\\nROS is a set of software libraries and tools that help you build robot applications. From driver development to state-of-the-art algorithms, and with powerful development tools, ROS has everything you need for your next robotics project.\\n\\n### Getting Started\\n\\nTo get started with ROS, check out our [installation guide](https://www.ros.org/blog/getting-started/).\\n\\n### What's Included\\n\\nROS includes a range of open-source projects, including:\\n\\n* **tinyxml2_vendor**: A fork of the tinyxml2 library for parsing XML files.\\n* **tlsf**: A library for secure communication over TLS (Transport Layer Security).\\n* **unique_identifier_msgs**: A package for generating unique identifiers for robots and other entities.\\n* **urdf**: A package for working with URDF (Unified Robot Description Format) files.\\n* **yaml_cpp_vendor**: A fork of the yaml-cpp library for parsing YAML files.\\n\\n### ROS Releases and Target Platforms\\n\\nFor more information on ROS releases, target platforms, and release notes, check out [REP-2000](https://ros.org/reps/rep-2000.html).\\n\\n### Project Resources\\n\\n* **ROSSwag**: Purchase ROS-related merchandise from our online store.\\n* **ROS Trademark Information**: Learn about the ROS trademark.\\n\\n### Get Involved\\n\\nStay up-to-date with the latest news and developments in ROS:\\n\\n* Follow us on [LinkedIn](https://www.linkedin.com/company/open-source-robotics-foundation)\\n* Join our Twitter community: [OpenRoboticsOrg](https://twitter.com/OpenRoboticsOrg), [ROSOrg](https://twitter.com/ROSOrg)\\n\\n### License and Contributions\\n\\nROS is an open-source project, licensed under the Apache 2.0 license.\\n\\nWe welcome contributions from the ROS community! 
If you have any ideas or bug fixes to contribute, check out our [contribution guidelines](https://ros.org/blog/contribute/).\\n\\n**Thank You**\\n\\nThanks for choosing ROS as your platform for robotics development!\\n\\nYou can modify this README file according to your needs and preferences.\""
|
70 |
+
]
|
71 |
+
},
|
72 |
+
"execution_count": 5,
|
73 |
+
"metadata": {},
|
74 |
+
"output_type": "execute_result"
|
75 |
+
}
|
76 |
+
],
|
77 |
+
"source": [
|
78 |
+
"# User query\n",
|
79 |
+
"query = \"Can you create a README file for ROS\"\n",
|
80 |
+
"\n",
|
81 |
+
"# Query expansion(I only generate one additional prompt for simplicity)\n",
|
82 |
+
"template = \"\"\"\n",
|
83 |
+
"Rewrite the prompt. The new prompt must offer a different perspective.\n",
|
84 |
+
"Do not change the meaning. Output only the rewritten prompt with no introduction.\n",
|
85 |
+
" Prompt: {prompt}\n",
|
86 |
+
"\"\"\"\n",
|
87 |
+
"prompt = PromptTemplate.from_template(template)\n",
|
88 |
+
"chain = {\"prompt\": itemgetter(\"prompt\")} | prompt | model\n",
|
89 |
+
"queryExpansion = chain.invoke({\"prompt\": query})\n",
|
90 |
+
"print(\"Query expansion: \", queryExpansion)\n",
|
91 |
+
"\n",
|
92 |
+
"# Self-querying(The metadata I will be generating determines whether to look through the Qdrant collection containing github code)\n",
|
93 |
+
"template = \"\"\"\n",
|
94 |
+
"You are an AI assistant. You must determine if the prompt requires code as the answer.\n",
|
95 |
+
"Output a 1 if it is or a 0 if it is not and nothing else.\n",
|
96 |
+
" Prompt: {prompt}\n",
|
97 |
+
"\"\"\"\n",
|
98 |
+
"prompt = PromptTemplate.from_template(template)\n",
|
99 |
+
"chain = {\"prompt\": itemgetter(\"prompt\")} | prompt | model\n",
|
100 |
+
"codingQuestion = chain.invoke({\"prompt\": query})\n",
|
101 |
+
"print(\"Coding Question?: \", codingQuestion)\n",
|
102 |
+
"\n",
|
103 |
+
"# Filtered vector search for each of the N queries after expansion\n",
|
104 |
+
"relatedCollection = 'Document'\n",
|
105 |
+
"if (codingQuestion == '1'):\n",
|
106 |
+
" relatedCollection = 'Github'\n",
|
107 |
+
"print(\"Related Collection: \", relatedCollection)\n",
|
108 |
+
"results1 = retriever(query, relatedCollection)\n",
|
109 |
+
"results2 = retriever(queryExpansion, relatedCollection)\n",
|
110 |
+
"\n",
|
111 |
+
"# Collecting results\n",
|
112 |
+
"results = results1+results2\n",
|
113 |
+
"\n",
|
114 |
+
"# Reranking(Instead of using a CrossEncoder, I will manually compare embeddings)\n",
|
115 |
+
"ids = [result.id for result in results]\n",
|
116 |
+
"scores = [result.score for result in results]\n",
|
117 |
+
"topIds = []\n",
|
118 |
+
"topIndexes = []\n",
|
119 |
+
"for x in range(3):\n",
|
120 |
+
" maxScore = 0\n",
|
121 |
+
" maxIndex = 0\n",
|
122 |
+
" for i in range(len(ids)):\n",
|
123 |
+
" if ids[i] not in topIds and scores[i] > maxScore:\n",
|
124 |
+
" maxScore = scores[i]\n",
|
125 |
+
" maxIndex = i\n",
|
126 |
+
" topIds.append(ids[maxIndex])\n",
|
127 |
+
" topIndexes.append(maxIndex)\n",
|
128 |
+
"texts = [result.payload['text'] for result in results]\n",
|
129 |
+
"links = [result.payload['link'] for result in results]\n",
|
130 |
+
"topTexts = ''\n",
|
131 |
+
"for index in topIndexes:\n",
|
132 |
+
" print(\"Top texts: \", texts[index])\n",
|
133 |
+
" print(\"Link: \", links[index])\n",
|
134 |
+
" topTexts += texts[index]\n",
|
135 |
+
"\n",
|
136 |
+
"# Building prompt\n",
|
137 |
+
"if(codingQuestion == '1'):\n",
|
138 |
+
" template = \"\"\"\n",
|
139 |
+
" Write code for the following question given the related coding document below.\n",
|
140 |
+
"\n",
|
141 |
+
" Document: {document}\n",
|
142 |
+
" Question: {question}\n",
|
143 |
+
" \"\"\"\n",
|
144 |
+
" prompt = PromptTemplate.from_template(template)\n",
|
145 |
+
"else:\n",
|
146 |
+
" template = \"\"\"\n",
|
147 |
+
" Answer the question based on the document below. If you can't answer the question, reply \"I don't know\"\n",
|
148 |
+
"\n",
|
149 |
+
" Document: {document}\n",
|
150 |
+
" Question: {question}\n",
|
151 |
+
" \"\"\"\n",
|
152 |
+
" prompt = PromptTemplate.from_template(template)\n",
|
153 |
+
"\n",
|
154 |
+
"# Obtaining answer\n",
|
155 |
+
"chain = {\"document\": itemgetter(\"document\"), \"question\": itemgetter(\"question\")} | prompt | model\n",
|
156 |
+
"chain.invoke({\"document\": topTexts, \"question\": query})"
|
157 |
+
]
|
158 |
+
}
|
159 |
+
],
|
160 |
+
"metadata": {
|
161 |
+
"kernelspec": {
|
162 |
+
"display_name": "Python 3",
|
163 |
+
"language": "python",
|
164 |
+
"name": "python3"
|
165 |
+
},
|
166 |
+
"language_info": {
|
167 |
+
"codemirror_mode": {
|
168 |
+
"name": "ipython",
|
169 |
+
"version": 3
|
170 |
+
},
|
171 |
+
"file_extension": ".py",
|
172 |
+
"mimetype": "text/x-python",
|
173 |
+
"name": "python",
|
174 |
+
"nbconvert_exporter": "python",
|
175 |
+
"pygments_lexer": "ipython3",
|
176 |
+
"version": "3.12.7"
|
177 |
+
}
|
178 |
+
},
|
179 |
+
"nbformat": 4,
|
180 |
+
"nbformat_minor": 2
|
181 |
+
}
|
project/README.md
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<h1>Installation:</h1>
|
2 |
+
<h3>Docker setup (easy):</h3>
|
3 |
+
|
4 |
+
* Clone the repository from huggingface
|
5 |
+
* Reopen the repository in a dev container
|
6 |
+
* Copy the .env.example into a new .env file in the project folder
|
7 |
+
* If you want to run files in the ClearML folder, fill out the ClearML env variables, otherwise no changes needed.
|
8 |
+
* Open a shell on the host machine(not the dev container) and navigate to the project folder
|
9 |
+
* Run "docker compose up -d"
|
10 |
+
* Run "docker exec -it ollama ollama pull llama3.2"
|
11 |
+
* Select the python 3.12.7 kernels for the notebooks and run DataCollectionPipeline.ipynb and FeaturePipeline.ipynb(to populate the mongodb and qdrant databases)
|
12 |
+
* The app is available on localhost:7860
|
13 |
+
|
14 |
+
<h3>Non-Docker (web based) setup:</h3>
|
15 |
+
|
16 |
+
If for some reason the docker setup does not work try connecting to mongodb, qdrant, ollama, and gradio from the web:
|
17 |
+
* Clone the repository from huggingface or the entire repository from github
|
18 |
+
* Reopen the repository in a dev container
|
19 |
+
* Copy the .env.example into a new .env file in the project folder
|
20 |
+
* Modify the .env file as instructed in the comments(create accounts for each website)
|
21 |
+
* Install ollama in the dev container
|
22 |
+
* curl -fsSL https://ollama.com/install.sh | sh
|
23 |
+
* Start up ollama
|
24 |
+
* ollama serve
|
25 |
+
* Download llama3.2(in a new dev container terminal)
|
26 |
+
* ollama pull llama3.2
|
27 |
+
* Select the python 3.12.7 kernels for the notebooks and run DataCollectionPipeline.ipynb and FeaturePipeline.ipynb(to populate the mongodb and qdrant databases)
|
28 |
+
* Run app.py and click on the link
|
29 |
+
|
30 |
+
<h1>Project infrastructure</h1>
|
31 |
+
|
32 |
+
Note some files may have similar code with other files, such as the ClearML files containing ipynb files rewritten in python in order to work in ClearML or gradio containing code from InferencePipeline.ipynb. The ipynb file prints output to help see what is happening.
|
33 |
+
|
34 |
+
# app.py
|
35 |
+
Sends a query to the inference pipeline to generate an answer. The DataCollectionPipeline.ipynb and FeaturePipeline.ipynb files must be run first to populate the databases.
|
36 |
+
|
37 |
+
# Data Collection Pipeline
|
38 |
+
The Data Collection pipeline takes as input a list of links to domains. The links are fed into the ETL pipeline which Extracts data from the links using a crawler, Transforms the data into a standardized format, and Loads the extracted data into a NoSQL data warehouse, which in this case is MongoDB. The ETL pipeline uses a different method of extracting and transforming based on the link type. In this project, I classify links as either a github repository or document each with their own crawler and cleaner. This raw data is used by the feature pipeline.
|
39 |
+
|
40 |
+
# Feature Pipeline
|
41 |
+
The Feature pipeline contains the ingestion pipeline.
|
42 |
+
* The ingestion pipeline extracts documents from MongoDB that were stored by the Data Collection Pipeline. It further cleans the data, breaks it into chunks depending on the data category, passes the chunks through an embedding model to generate embeddings, then loads the embeddings plus their metadata into a vector database, which in this case is Qdrant. The embeddings are passed with additional metadata that contains the document link, type, chunk number, and content.
|
43 |
+
|
44 |
+
# Training Pipeline
|
45 |
+
The training pipeline performs finetuning. I skipped this step since it was not required.
|
46 |
+
|
47 |
+
# Inference Pipeline
|
48 |
+
The inference pipeline contains the retrieval client/pipeline.
|
49 |
+
* The retrieval client takes a prompt as input. It uses the same embedding model as the ingestion pipeline in order to create an embedding for the prompt. It then queries the Qdrant database for the 10 closest embeddings using cosine distance and extracts the text chunk stored in the embeddings' metadata. This returns chunks that are related to the prompt.
|
50 |
+
* The inference pipeline takes a query as input. It expands the query into N=2 queries using a prompt template, performs self-querying to extract metadata (document type) from the original query, searches the Qdrant for K=10 relevant chunks to each of the N=2 queries plus metadata using the retrieval client, combines the K=10 results from each of the N=2 queries, filters out only the most relevant 3 results, prompts the LLM with the results as context, and generates an answer.
|
51 |
+
|
52 |
+
# ClearML
|
53 |
+
The ClearML folder contains the notebook (.ipynb) pipeline files rewritten to work with ClearML. It is similar code to the notebooks; however, ClearML does not print any output but instead logs all output on the ClearML website. The website stores the pipelines, which take input and produce output stored in artifacts. These are the differences between the notebook (.ipynb) pipeline files and the ClearML pipeline files (.py):
|
54 |
+
* The ClearML Data Collection Pipeline works the same way, running the entire ETL pipeline in a single step. (I could not split the ETL pipeline into 3 steps (Extract, Transform, Load) since my list of links gets bigger while looping through it, because it also goes through some links inside the websites crawled. Breaking it into steps would require more HTTP requests, which would greatly slow down the pipeline.)
|
55 |
+
* The Feature Pipeline breaks down the notebook's loop (from the ingestion pipeline) into 5 stages: retrieve documents, clean documents, chunk documents, embed chunks, and store embeddings.
|
56 |
+
* The Inference Pipeline simply puts each step in the notebook's version into a function. These functions are query expansion, self-querying, filtered vector search, collecting results, reranking, building prompt, and obtaining answer.
|
57 |
+
|
58 |
+
# Tools
|
59 |
+
The tools folder contains code for viewing/deleting what has been stored in MongoDB and Qdrant
|
60 |
+
|
61 |
+
# shared.py
|
62 |
+
shared.py is in both the project folder and project/Tools folder. It contains functions for setting up the connections with either the docker containers or web services. If you are running into errors connecting to any of the services, consider editing this file or double checking the .env file. Note the ClearML folder hardcodes all functions since it had trouble importing code.
|
project/Tools/QdrantTools.ipynb
ADDED
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"name": "stdout",
|
10 |
+
"output_type": "stream",
|
11 |
+
"text": [
|
12 |
+
"id=0 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 0, 'text': 'ROS: Home Why ROS? Getting Started Community Ecosystem ROS - Robot Operating System The Robot Operating System (ROS) is a set of software libraries and tools that help you build robot applications. From drivers to state-of-the-art algorithms, and with powerful developer tools, ROS has what you need for your next robotics project. And it\\'s all open source. What is ROS? ROS Videos \" Install Jazzy Jalisco Jazzy Jalisco is our latest ROS 2 LTS release targeted at the Ubuntu 24.04 (Noble) and'} vector=None shard_key=None order_value=None\n",
|
13 |
+
"id=1 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 1, 'text': '24.04 (Noble) and Windows 10, though other systems are supported to varying degrees. Learn More Humble Hawksbill ROS 2 Humble Hawksbill is a slighly older LTS release of ROS 2 targeted at Ubuntu 22.04 (Jammy) and Windows 10. Other systems are supported including tier 3 support for 20.04 for those transitioning from ROS 1. Learn More Support There are several mechanisms in place to support the ROS community, each with its own purpose. Documentation Documentation and tutorials for ROS 2 Stack'} vector=None shard_key=None order_value=None\n",
|
14 |
+
"id=2 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 2, 'text': 'for ROS 2 Stack Exchange Ask questions. Get answers. Forums Hear the latest discussions ROS 1 Wiki Legacy documentation and tutorials for ROS 1 Documentation Documentation and tutorials for ROS 2 Robotics Stack Exchange Ask questions.Get answers.All ROS versions Forums Hear the latest discussions ROS 1 Wiki Legacy documentation and tutorials for ROS 1 Recent Updates and Highlights ROSCon 2024 Videos are Now Available See the ROSCon 2024 website for details 11/18/2024 - Katherine Scott The'} vector=None shard_key=None order_value=None\n",
|
15 |
+
"id=3 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 3, 'text': 'Katherine Scott The videos from ROSCon 2024 in Odense are now available on the ROSCon Website (see the program), this Vimeo showcase, and in the ROS documentation. The ROSCon website also includes the slides from all the talks at ROSCon. I have also included a list of all the videos below. I want to thank AMD for being our 2024 ROSCon video sponsor, their generous support makes the ROSCon live stream and videos possible. READ MORE Recent ROS Discourse Posts ROS News of the Week 11/22/2024 - ROS'} vector=None shard_key=None order_value=None\n",
|
16 |
+
"id=4 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 4, 'text': '11/22/2024 - ROS Discourse Gazebo Classic and Citadel End of Life 12/2/2024 - ROS Discourse ROS 2 driver for Mitsubishi Melfa RV-FR 10/24/2024 ROS Discourse Home Why ROS? Getting Started Community Ecosystem Q&A Forum Packages Wiki Documentation media Q&A Forum Packages ROSCon Wiki documentation discord Brought to you by Open Robotics | licensed under Creative Commons Attributions 3.0 | 2021 Open Robotics'} vector=None shard_key=None order_value=None\n",
|
17 |
+
"Number of document chunks: 5\n",
|
18 |
+
"\n",
|
19 |
+
"Sample document chunk(metadata not the vector): \n",
|
20 |
+
"id=0 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 0, 'text': 'ROS: Home Why ROS? Getting Started Community Ecosystem ROS - Robot Operating System The Robot Operating System (ROS) is a set of software libraries and tools that help you build robot applications. From drivers to state-of-the-art algorithms, and with powerful developer tools, ROS has what you need for your next robotics project. And it\\'s all open source. What is ROS? ROS Videos \" Install Jazzy Jalisco Jazzy Jalisco is our latest ROS 2 LTS release targeted at the Ubuntu 24.04 (Noble) and'} vector=None shard_key=None order_value=None \n",
|
21 |
+
"\n",
|
22 |
+
"id=0 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 0, 'text': \"#About TheRobotOperatingSystem(ROS)isasetofsoftwarelibrariesandtoolsthathelpyoubuildrobotapplications. Fromdriverstostate-of-the-artalgorithms,andwithpowerfuldevelopertools,ROShaswhatyouneedforyournextroboticsproject. Andit'sallopensource. Fullprojectdetailson[ROS.org](https://ros.org/) #GettingStarted LookingtogetstartedwithROS? Our[installationguideishere](https://www.ros.org/blog/getting-started/).\"} vector=None shard_key=None order_value=None\n",
|
23 |
+
"id=1 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 1, 'text': \"Onceyou'veinstalledROSstartbylearningsome[basicconcepts](https://docs.ros.org/en/rolling/Concepts/Basic.html)andtakealookatour[beginnertutorials](https://docs.ros.org/en/rolling/Tutorials/Beginner-CLI-Tools.html). #JointheROSCommunity ##CommunityResources *[ROSDiscussionForum](https://discourse.ros.org/) *[ROSDiscordServer](https://discord.com/servers/open-robotics-1077825543698927656) *[RoboticsStackExchange](https://robotics.stackexchange.com/)(preferredROSsupportforum).\"} vector=None shard_key=None order_value=None\n",
|
24 |
+
"id=2 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 2, 'text': '*[OfficialROSVideos](https://vimeo.com/osrfoundation) *[ROSCon](https://roscon.ros.org),ouryearlydeveloperconference. *CiteROS2inacademicworkusing[DOI:10.1126/scirobotics.abm6074](https://www.science.org/doi/10.1126/scirobotics.abm6074) ##DeveloperResources *[ROS2Documentation](https://docs.ros.org/) *[ROSPackageAPIreference](https://docs.ros.org/en/rolling/p/) *[ROSPackageIndex](https://index.ros.org/) *[ROSonDockerHub](https://hub.docker.com/_/ros/)'} vector=None shard_key=None order_value=None\n",
|
25 |
+
"id=3 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 3, 'text': '*[ROSResourceStatusPage](https://status.openrobotics.org/) *[REP-2000](https://ros.org/reps/rep-2000.html):ROS2ReleasesandTargetPlatforms ##ProjectResources *[PurchaseROSSwag](https://spring.ros.org/) *[InformationabouttheROSTrademark](https://www.ros.org/blog/media/) *OnSocialMedia *[OpenRoboticsonLinkedIn](https://www.linkedin.com/company/open-source-robotics-foundation) *[OpenRoboticsonTwitter](https://twitter.com/OpenRoboticsOrg) *[ROS.orgonTwitter](https://twitter.com/ROSOrg)'} vector=None shard_key=None order_value=None\n",
|
26 |
+
"id=4 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 4, 'text': 'ROSismadepossiblethroughthegeneroussupportofopensourcecontributorsandthenon-profit[OpenSourceRoboticsFoundation(OSRF)](https://www.openrobotics.org/). TaxdeductibledonationstotheOSRFcanbe[madehere.](https://donorbox.org/support-open-robotics?utm_medium=qrcode&utm_source=qrcode)'} vector=None shard_key=None order_value=None\n",
|
27 |
+
"id=5 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/.gitignore', 'type': 'Github', 'chunk': 0, 'text': '#Ignoredefaultnamesforcolconcreatedfolders build install log #Ignoreeverythinginsrcexcepta.gitkeepfile src/* !src/.gitkeep'} vector=None shard_key=None order_value=None\n",
|
28 |
+
"id=6 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/CODEOWNERS', 'type': 'Github', 'chunk': 0, 'text': '#Thisfilewasgeneratedbyhttps://github.com/audrow/update-ros2-repos *@clalancette@codebot'} vector=None shard_key=None order_value=None\n",
|
29 |
+
"id=7 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/ros2.repos', 'type': 'Github', 'chunk': 0, 'text': 'repositories: ament/ament_cmake: type:git url:https://github.com/ament/ament_cmake.git version:rolling ament/ament_index: type:git url:https://github.com/ament/ament_index.git version:rolling ament/ament_lint: type:git url:https://github.com/ament/ament_lint.git version:rolling ament/ament_package: type:git url:https://github.com/ament/ament_package.git version:rolling ament/google_benchmark_vendor: type:git url:https://github.com/ament/google_benchmark_vendor.git version:rolling'} vector=None shard_key=None order_value=None\n",
|
30 |
+
"id=8 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/ros2.repos', 'type': 'Github', 'chunk': 1, 'text': 'version:rolling ament/googletest: type:git url:https://github.com/ament/googletest.git version:rolling ament/uncrustify_vendor: type:git url:https://github.com/ament/uncrustify_vendor.git version:rolling eProsima/Fast-CDR: type:git url:https://github.com/eProsima/Fast-CDR.git version:2.2.x eProsima/Fast-DDS: type:git url:https://github.com/eProsima/Fast-DDS.git version:2.14.x eProsima/foonathan_memory_vendor: type:git url:https://github.com/eProsima/foonathan_memory_vendor.git version:master'} vector=None shard_key=None order_value=None\n",
|
31 |
+
"id=9 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/ros2.repos', 'type': 'Github', 'chunk': 2, 'text': 'version:master eclipse-cyclonedds/cyclonedds: type:git url:https://github.com/eclipse-cyclonedds/cyclonedds.git version:releases/0.10.x eclipse-iceoryx/iceoryx: type:git url:https://github.com/eclipse-iceoryx/iceoryx.git version:release_2.0 gazebo-release/gz_cmake_vendor: type:git url:https://github.com/gazebo-release/gz_cmake_vendor.git version:rolling gazebo-release/gz_math_vendor: type:git url:https://github.com/gazebo-release/gz_math_vendor.git version:rolling'} vector=None shard_key=None order_value=None\n",
|
32 |
+
"\n",
|
33 |
+
"Number of Github chunks: 10\n",
|
34 |
+
"\n",
|
35 |
+
"Sample Github chunk(metadata not the vector): \n",
|
36 |
+
"id=0 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 0, 'text': \"#About TheRobotOperatingSystem(ROS)isasetofsoftwarelibrariesandtoolsthathelpyoubuildrobotapplications. Fromdriverstostate-of-the-artalgorithms,andwithpowerfuldevelopertools,ROShaswhatyouneedforyournextroboticsproject. Andit'sallopensource. Fullprojectdetailson[ROS.org](https://ros.org/) #GettingStarted LookingtogetstartedwithROS? Our[installationguideishere](https://www.ros.org/blog/getting-started/).\"} vector=None shard_key=None order_value=None \n",
|
37 |
+
"\n"
|
38 |
+
]
|
39 |
+
},
|
40 |
+
{
|
41 |
+
"name": "stderr",
|
42 |
+
"output_type": "stream",
|
43 |
+
"text": [
|
44 |
+
"/workspaces/RAG_LLM/project/Tools/shared.py:57: LangChainDeprecationWarning: The class `OllamaEmbeddings` was deprecated in LangChain 0.3.1 and will be removed in 1.0.0. An updated version of the class exists in the :class:`~langchain-ollama package and should be used instead. To use it run `pip install -U :class:`~langchain-ollama` and import as `from :class:`~langchain_ollama import OllamaEmbeddings``.\n",
|
45 |
+
" return OllamaEmbeddings(model=MODEL, base_url=\"http://host.docker.internal:11434\")\n"
|
46 |
+
]
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"name": "stdout",
|
50 |
+
"output_type": "stream",
|
51 |
+
"text": [
|
52 |
+
"\n",
|
53 |
+
"Sample search result(n=2): \n",
|
54 |
+
"id=4 version=4 score=0.38799083 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 4, 'text': '11/22/2024 - ROS Discourse Gazebo Classic and Citadel End of Life 12/2/2024 - ROS Discourse ROS 2 driver for Mitsubishi Melfa RV-FR 10/24/2024 ROS Discourse Home Why ROS? Getting Started Community Ecosystem Q&A Forum Packages Wiki Documentation media Q&A Forum Packages ROSCon Wiki documentation discord Brought to you by Open Robotics | licensed under Creative Commons Attributions 3.0 | 2021 Open Robotics'} vector=None shard_key=None order_value=None\n",
|
55 |
+
"id=2 version=2 score=0.35047314 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 2, 'text': 'for ROS 2 Stack Exchange Ask questions. Get answers. Forums Hear the latest discussions ROS 1 Wiki Legacy documentation and tutorials for ROS 1 Documentation Documentation and tutorials for ROS 2 Robotics Stack Exchange Ask questions.Get answers.All ROS versions Forums Hear the latest discussions ROS 1 Wiki Legacy documentation and tutorials for ROS 1 Recent Updates and Highlights ROSCon 2024 Videos are Now Available See the ROSCon 2024 website for details 11/18/2024 - Katherine Scott The'} vector=None shard_key=None order_value=None\n"
|
56 |
+
]
|
57 |
+
}
|
58 |
+
],
|
59 |
+
"source": [
|
60 |
+
"from shared import getQdrantClient, getEmbeddingsModel\n",
|
61 |
+
"qClient = getQdrantClient()\n",
|
62 |
+
"\n",
|
63 |
+
"# Show everything in the Document collection\n",
|
64 |
+
"numDocumentChunks = 0\n",
|
65 |
+
"# Note with_vectors defaults to false, so the vectors are not returned\n",
|
66 |
+
"chunks = qClient.scroll(collection_name='Document')\n",
|
67 |
+
"#print(chunks)\n",
|
68 |
+
"for chunk in chunks[0]:\n",
|
69 |
+
" # Only display chunks if vector database is small\n",
|
70 |
+
" print(chunk)\n",
|
71 |
+
" if numDocumentChunks == 0:\n",
|
72 |
+
" sampleDocumentChunk = chunk\n",
|
73 |
+
" numDocumentChunks += 1\n",
|
74 |
+
"print(\"Number of document chunks: \", numDocumentChunks)\n",
|
75 |
+
"if numDocumentChunks > 0:\n",
|
76 |
+
" print(\"\\nSample document chunk(metadata not the vector): \")\n",
|
77 |
+
" print(sampleDocumentChunk, '\\n')\n",
|
78 |
+
"\n",
|
79 |
+
"# Show everything in the Github collection\n",
|
80 |
+
"numGithubChunks = 0\n",
|
81 |
+
"chunks = qClient.scroll(collection_name='Github')\n",
|
82 |
+
"#print(chunks)\n",
|
83 |
+
"for chunk in chunks[0]:\n",
|
84 |
+
" # Only display chunks if vector database is small\n",
|
85 |
+
" print(chunk)\n",
|
86 |
+
" if numGithubChunks == 0:\n",
|
87 |
+
" sampleGithubChunk = chunk\n",
|
88 |
+
" numGithubChunks += 1\n",
|
89 |
+
"print(\"\\nNumber of Github chunks: \", numGithubChunks)\n",
|
90 |
+
"if numGithubChunks > 0:\n",
|
91 |
+
" print(\"\\nSample Github chunk(metadata not the vector): \")\n",
|
92 |
+
" print(sampleGithubChunk, '\\n')\n",
|
93 |
+
"\n",
|
94 |
+
"# Show a sample search\n",
|
95 |
+
"embeddingsModel = getEmbeddingsModel()\n",
|
96 |
+
"results = qClient.search(\n",
|
97 |
+
" collection_name=\"Document\",\n",
|
98 |
+
" query_vector = embeddingsModel.embed_query(\"What operating system is ROS made for?\"),\n",
|
99 |
+
" limit=2\n",
|
100 |
+
")\n",
|
101 |
+
"print(\"\\nSample search result(n=2): \")\n",
|
102 |
+
"for result in results:\n",
|
103 |
+
" print(result)"
|
104 |
+
]
|
105 |
+
},
|
106 |
+
{
|
107 |
+
"cell_type": "code",
|
108 |
+
"execution_count": 12,
|
109 |
+
"metadata": {},
|
110 |
+
"outputs": [
|
111 |
+
{
|
112 |
+
"name": "stdout",
|
113 |
+
"output_type": "stream",
|
114 |
+
"text": [
|
115 |
+
"Cosine Similarity for related sentences: 0.7035977848391597\n",
|
116 |
+
"Cosine Similarity for unrelated sentences: 0.3566534327076298\n"
|
117 |
+
]
|
118 |
+
}
|
119 |
+
],
|
120 |
+
"source": [
|
121 |
+
"import numpy as np\n",
|
122 |
+
"# How cosine distance works\n",
|
123 |
+
"\n",
|
124 |
+
"embedding1 = embeddingsModel.embed_query(\"What is the weather like?\")\n",
|
125 |
+
"embedding2 = embeddingsModel.embed_query(\"It is raining today.\")\n",
|
126 |
+
"embedding3 = embeddingsModel.embed_query(\"ROS is an open source platform\")\n",
|
127 |
+
"def cosine_similarity(vec1, vec2):\n",
|
128 |
+
" dot_product = np.dot(vec1, vec2)\n",
|
129 |
+
" norm_vec1 = np.linalg.norm(vec1)\n",
|
130 |
+
" norm_vec2 = np.linalg.norm(vec2)\n",
|
131 |
+
" return dot_product / (norm_vec1 * norm_vec2)\n",
|
132 |
+
"similarity1 = cosine_similarity(embedding1, embedding2)\n",
|
133 |
+
"similarity2 = cosine_similarity(embedding1, embedding3)\n",
|
134 |
+
"print(\"Cosine Similarity for related sentences:\", similarity1)\n",
|
135 |
+
"print(\"Cosine Similarity for unrelated sentences:\", similarity2)"
|
136 |
+
]
|
137 |
+
},
|
138 |
+
{
|
139 |
+
"cell_type": "code",
|
140 |
+
"execution_count": 20,
|
141 |
+
"metadata": {},
|
142 |
+
"outputs": [
|
143 |
+
{
|
144 |
+
"data": {
|
145 |
+
"text/plain": [
|
146 |
+
"True"
|
147 |
+
]
|
148 |
+
},
|
149 |
+
"execution_count": 20,
|
150 |
+
"metadata": {},
|
151 |
+
"output_type": "execute_result"
|
152 |
+
}
|
153 |
+
],
|
154 |
+
"source": [
|
155 |
+
"from qdrant_client.http.models import Distance, VectorParams\n",
|
156 |
+
"# Delete all collections and vectors inside them\n",
|
157 |
+
"qClient.delete_collection(collection_name = \"Document\")\n",
|
158 |
+
"qClient.delete_collection(collection_name = \"Github\")\n",
|
159 |
+
"# Recreate the empty collections\n",
|
160 |
+
"qClient.create_collection(\n",
|
161 |
+
" collection_name = \"Document\",\n",
|
162 |
+
" vectors_config=VectorParams(size=3072, distance=Distance.COSINE)\n",
|
163 |
+
")\n",
|
164 |
+
"qClient.create_collection(\n",
|
165 |
+
" collection_name = \"Github\",\n",
|
166 |
+
" vectors_config=VectorParams(size=3072, distance=Distance.COSINE)\n",
|
167 |
+
")"
|
168 |
+
]
|
169 |
+
}
|
170 |
+
],
|
171 |
+
"metadata": {
|
172 |
+
"kernelspec": {
|
173 |
+
"display_name": "Python 3",
|
174 |
+
"language": "python",
|
175 |
+
"name": "python3"
|
176 |
+
},
|
177 |
+
"language_info": {
|
178 |
+
"codemirror_mode": {
|
179 |
+
"name": "ipython",
|
180 |
+
"version": 3
|
181 |
+
},
|
182 |
+
"file_extension": ".py",
|
183 |
+
"mimetype": "text/x-python",
|
184 |
+
"name": "python",
|
185 |
+
"nbconvert_exporter": "python",
|
186 |
+
"pygments_lexer": "ipython3",
|
187 |
+
"version": "3.12.7"
|
188 |
+
}
|
189 |
+
},
|
190 |
+
"nbformat": 4,
|
191 |
+
"nbformat_minor": 2
|
192 |
+
}
|
project/Tools/__pycache__/shared.cpython-312.pyc
ADDED
Binary file (4.07 kB). View file
|
|
project/Tools/mongoTools.ipynb
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"name": "stdout",
|
10 |
+
"output_type": "stream",
|
11 |
+
"text": [
|
12 |
+
"Number of regular documents: 1\n",
|
13 |
+
"Number of github documents: 5\n",
|
14 |
+
"Links crawled: ['https://www.ros.org/', 'https://github.com/ros2/ros2/tree/rolling/README.md', 'https://github.com/ros2/ros2/tree/rolling/.gitignore', 'https://github.com/ros2/ros2/tree/rolling/CODEOWNERS', 'https://github.com/ros2/ros2/tree/rolling/ros2.repos', 'https://github.com/ros2/ros2/tree/rolling/src/.gitkeep']\n",
|
15 |
+
"Sample regular document: {'_id': ObjectId('675531b926a728d5b045a2e5'), 'link': 'https://www.ros.org/', 'type': 'Document', 'content': ' ROS: Home Why ROS? Getting Started Community Ecosystem ROS - Robot Operating System The Robot Operating System (ROS) is a set of software libraries and tools that help you build robot applications. From drivers to state-of-the-art algorithms, and with powerful developer tools, ROS has what you need for your next robotics project. And it\\'s all open source. What is ROS? ROS Videos \" Install Jazzy Jalisco Jazzy Jalisco is our latest ROS 2 LTS release targeted at the Ubuntu 24.04 (Noble) and Windows 10, though other systems are supported to varying degrees. Learn More Humble Hawksbill ROS 2 Humble Hawksbill is a slighly older LTS release of ROS 2 targeted at Ubuntu 22.04 (Jammy) and Windows 10. Other systems are supported including tier 3 support for 20.04 for those transitioning from ROS 1. Learn More Support There are several mechanisms in place to support the ROS community, each with its own purpose. Documentation Documentation and tutorials for ROS 2 Stack Exchange Ask questions. Get answers. Forums Hear the latest discussions ROS 1 Wiki Legacy documentation and tutorials for ROS 1 Documentation Documentation and tutorials for ROS 2 Robotics Stack Exchange Ask questions.Get answers.All ROS versions Forums Hear the latest discussions ROS 1 Wiki Legacy documentation and tutorials for ROS 1 Recent Updates and Highlights ROSCon 2024 Videos are Now Available See the ROSCon 2024 website for details 11/18/2024 - Katherine Scott The videos from ROSCon 2024 in Odense are now available on the ROSCon Website (see the program), this Vimeo showcase, and in the ROS documentation. The ROSCon website also includes the slides from all the talks at ROSCon. I have also included a list of all the videos below. I want to thank AMD for being our 2024 ROSCon video sponsor, their generous support makes the ROSCon live stream and videos possible. 
READ MORE Recent ROS Discourse Posts ROS News of the Week 11/22/2024 - ROS Discourse Gazebo Classic and Citadel End of Life 12/2/2024 - ROS Discourse ROS 2 driver for Mitsubishi Melfa RV-FR 10/24/2024 ROS Discourse Home Why ROS? Getting Started Community Ecosystem Q&A Forum Packages Wiki Documentation media Q&A Forum Packages ROSCon Wiki documentation discord Brought to you by Open Robotics | licensed under Creative Commons Attributions 3.0 | ©2021 Open Robotics '}\n",
|
16 |
+
"Sample github document {'_id': ObjectId('675531b926a728d5b045a2e6'), 'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'content': \"#About TheRobotOperatingSystem(ROS)isasetofsoftwarelibrariesandtoolsthathelpyoubuildrobotapplications. Fromdriverstostate-of-the-artalgorithms,andwithpowerfuldevelopertools,ROShaswhatyouneedforyournextroboticsproject. Andit'sallopensource. Fullprojectdetailson[ROS.org](https://ros.org/) #GettingStarted LookingtogetstartedwithROS? Our[installationguideishere](https://www.ros.org/blog/getting-started/). Onceyou'veinstalledROSstartbylearningsome[basicconcepts](https://docs.ros.org/en/rolling/Concepts/Basic.html)andtakealookatour[beginnertutorials](https://docs.ros.org/en/rolling/Tutorials/Beginner-CLI-Tools.html). #JointheROSCommunity ##CommunityResources *[ROSDiscussionForum](https://discourse.ros.org/) *[ROSDiscordServer](https://discord.com/servers/open-robotics-1077825543698927656) *[RoboticsStackExchange](https://robotics.stackexchange.com/)(preferredROSsupportforum). *[OfficialROSVideos](https://vimeo.com/osrfoundation) *[ROSCon](https://roscon.ros.org),ouryearlydeveloperconference. 
*CiteROS2inacademicworkusing[DOI:10.1126/scirobotics.abm6074](https://www.science.org/doi/10.1126/scirobotics.abm6074) ##DeveloperResources *[ROS2Documentation](https://docs.ros.org/) *[ROSPackageAPIreference](https://docs.ros.org/en/rolling/p/) *[ROSPackageIndex](https://index.ros.org/) *[ROSonDockerHub](https://hub.docker.com/_/ros/) *[ROSResourceStatusPage](https://status.openrobotics.org/) *[REP-2000](https://ros.org/reps/rep-2000.html):ROS2ReleasesandTargetPlatforms ##ProjectResources *[PurchaseROSSwag](https://spring.ros.org/) *[InformationabouttheROSTrademark](https://www.ros.org/blog/media/) *OnSocialMedia *[OpenRoboticsonLinkedIn](https://www.linkedin.com/company/open-source-robotics-foundation) *[OpenRoboticsonTwitter](https://twitter.com/OpenRoboticsOrg) *[ROS.orgonTwitter](https://twitter.com/ROSOrg) ROSismadepossiblethroughthegeneroussupportofopensourcecontributorsandthenon-profit[OpenSourceRoboticsFoundation(OSRF)](https://www.openrobotics.org/). TaxdeductibledonationstotheOSRFcanbe[madehere.](https://donorbox.org/support-open-robotics?utm_medium=qrcode&utm_source=qrcode) \"}\n"
|
17 |
+
]
|
18 |
+
}
|
19 |
+
],
|
20 |
+
"source": [
|
21 |
+
"# Shows the state of the mongo database\n",
|
22 |
+
"from shared import getMongoClient\n",
|
23 |
+
"mongoHost = getMongoClient()\n",
|
24 |
+
"mongoDatabase = mongoHost[\"twin\"]\n",
|
25 |
+
"mongoDocumentCollection = mongoDatabase[\"Document\"]\n",
|
26 |
+
"mongoGithubCollection = mongoDatabase[\"Github\"]\n",
|
27 |
+
"documents = mongoDocumentCollection.find()\n",
|
28 |
+
"codes = mongoGithubCollection.find()\n",
|
29 |
+
"numDocuments = 0\n",
|
30 |
+
"numCodes = 0\n",
|
31 |
+
"links = []\n",
|
32 |
+
"for document in documents:\n",
|
33 |
+
" links.append(document[\"link\"])\n",
|
34 |
+
" if numDocuments == 0:\n",
|
35 |
+
" sampleDocument = document\n",
|
36 |
+
" numDocuments += 1\n",
|
37 |
+
"for code in codes:\n",
|
38 |
+
" links.append(code[\"link\"])\n",
|
39 |
+
" if numCodes == 0:\n",
|
40 |
+
" sampleCode = code\n",
|
41 |
+
" numCodes += 1\n",
|
42 |
+
"print(\"Number of regular documents: \", numDocuments)\n",
|
43 |
+
"print(\"Number of github documents: \", numCodes)\n",
|
44 |
+
"print(\"Links crawled: \", links)\n",
|
45 |
+
"if (numDocuments > 0):\n",
|
46 |
+
" print(\"Sample regular document: \", sampleDocument)\n",
|
47 |
+
"else:\n",
|
48 |
+
" print(\"No documents\")\n",
|
49 |
+
"if (numCodes > 0):\n",
|
50 |
+
" print(\"Sample github document\", sampleCode)\n",
|
51 |
+
"else:\n",
|
52 |
+
" print(\"No github documents\")"
|
53 |
+
]
|
54 |
+
},
|
55 |
+
{
|
56 |
+
"cell_type": "code",
|
57 |
+
"execution_count": 6,
|
58 |
+
"metadata": {},
|
59 |
+
"outputs": [
|
60 |
+
{
|
61 |
+
"data": {
|
62 |
+
"text/plain": [
|
63 |
+
"DeleteResult({'n': 18, 'electionId': ObjectId('7fffffff000000000000016a'), 'opTime': {'ts': Timestamp(1733585625, 33), 't': 362}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1733585625, 33), 'signature': {'hash': b'\\xd0\\xa2\\xaf\\x1c?p\\xc5\\xd7\\x9a\\x1e\\x1f\\x15\\x1ews\\xdc\\xab)\\xf2B', 'keyId': 7395232362797203469}}, 'operationTime': Timestamp(1733585625, 33)}, acknowledged=True)"
|
64 |
+
]
|
65 |
+
},
|
66 |
+
"execution_count": 6,
|
67 |
+
"metadata": {},
|
68 |
+
"output_type": "execute_result"
|
69 |
+
}
|
70 |
+
],
|
71 |
+
"source": [
|
72 |
+
"# Delete all files\n",
|
73 |
+
"mongoDocumentCollection.delete_many({})\n",
|
74 |
+
"mongoGithubCollection.delete_many({})"
|
75 |
+
]
|
76 |
+
}
|
77 |
+
],
|
78 |
+
"metadata": {
|
79 |
+
"kernelspec": {
|
80 |
+
"display_name": "Python 3",
|
81 |
+
"language": "python",
|
82 |
+
"name": "python3"
|
83 |
+
},
|
84 |
+
"language_info": {
|
85 |
+
"codemirror_mode": {
|
86 |
+
"name": "ipython",
|
87 |
+
"version": 3
|
88 |
+
},
|
89 |
+
"file_extension": ".py",
|
90 |
+
"mimetype": "text/x-python",
|
91 |
+
"name": "python",
|
92 |
+
"nbconvert_exporter": "python",
|
93 |
+
"pygments_lexer": "ipython3",
|
94 |
+
"version": "3.12.7"
|
95 |
+
}
|
96 |
+
},
|
97 |
+
"nbformat": 4,
|
98 |
+
"nbformat_minor": 2
|
99 |
+
}
|
project/Tools/shared.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# This file contains shared functions used by multiple files
|
2 |
+
import os
|
3 |
+
import sys
|
4 |
+
|
5 |
+
import pymongo
|
6 |
+
from dotenv import load_dotenv
|
7 |
+
from langchain_community.embeddings import OllamaEmbeddings
|
8 |
+
from langchain_openai.chat_models import ChatOpenAI
|
9 |
+
from qdrant_client import QdrantClient
|
10 |
+
from langchain_community.llms import Ollama
|
11 |
+
|
12 |
+
|
13 |
+
# Unused since usage limit reached since years ago...rip
def getOpenAiModel():
    """Build a ChatOpenAI client for gpt-3.5-turbo using OPENAI_API_KEY from .env."""
    model_name = "gpt-3.5-turbo"
    # Load environment variables; fall back to the .env one level up when the
    # default lookup raises (same pattern as the other setup helpers here).
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    api_key = os.getenv("OPENAI_API_KEY")
    return ChatOpenAI(openai_api_key=api_key, model=model_name)
|
22 |
+
|
23 |
+
|
24 |
+
# Create a mongoDB connection
def getMongoClient():
    """Connect to MongoDB using the DATABASE_HOST connection string from .env."""
    try:
        load_dotenv(override=True)
    except Exception:
        # Fall back to the .env located one path entry up.
        load_dotenv(sys.path[1] + "/.env", override=True)
    host = os.getenv("DATABASE_HOST")
    return pymongo.MongoClient(host)
|
32 |
+
|
33 |
+
|
34 |
+
# Create a qdrant connection
def getQdrantClient():
    """Return a QdrantClient, authenticated against Qdrant Cloud when
    USE_QDRANT_CLOUD is set to "True" in the environment.

    Reads USE_QDRANT_CLOUD, QDRANT_CLOUD_URL, and QDRANT_APIKEY from .env.
    Note the non-cloud branch still connects to QDRANT_CLOUD_URL, just
    without an API key (e.g. a local/self-hosted instance URL stored in
    the same variable).
    """
    try:
        load_dotenv(override=True)
    except Exception:
        # Fall back to the .env one path entry up, as the other helpers do.
        load_dotenv(sys.path[1] + "/.env", override=True)
    USE_QDRANT_CLOUD = os.getenv("USE_QDRANT_CLOUD")
    QDRANT_CLOUD_URL = os.getenv("QDRANT_CLOUD_URL")
    QDRANT_APIKEY = os.getenv("QDRANT_APIKEY")
    # BUG FIX: os.getenv returns a string (or None), so the previous
    # `if USE_QDRANT_CLOUD:` was truthy for ANY non-empty value, including
    # "False". Compare against "True" explicitly, consistent with the
    # USE_DOCKER checks elsewhere in this file.
    if USE_QDRANT_CLOUD == "True":
        return QdrantClient(url=QDRANT_CLOUD_URL, api_key=QDRANT_APIKEY)
    else:
        # Local / self-hosted instance: no API key needed.
        return QdrantClient(url=QDRANT_CLOUD_URL)
|
47 |
+
|
48 |
+
|
49 |
+
# Setup the text embedder
def getEmbeddingsModel(MODEL="llama3.2"):
    """Return an OllamaEmbeddings instance for MODEL; when USE_DOCKER is
    "True", reach the host's Ollama server via host.docker.internal."""
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    running_in_docker = os.getenv("USE_DOCKER") == "True"
    if running_in_docker:
        # Inside a container, Ollama listens on the host machine.
        return OllamaEmbeddings(model=MODEL, base_url="http://host.docker.internal:11434")
    return OllamaEmbeddings(model=MODEL)
|
60 |
+
|
61 |
+
|
62 |
+
# Setup the model
def getModel(MODEL="llama3.2"):
    """Return an Ollama LLM for MODEL; when USE_DOCKER is "True", reach the
    host's Ollama server via host.docker.internal."""
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    running_in_docker = os.getenv("USE_DOCKER") == "True"
    if running_in_docker:
        # Inside a container, Ollama listens on the host machine.
        return Ollama(model=MODEL, base_url="http://host.docker.internal:11434")
    return Ollama(model=MODEL)
|
73 |
+
|
74 |
+
|
75 |
+
# Setup clearML
def setupClearML():
    """Read the ClearML connection settings from the environment and return
    them as a 5-tuple: (web host, api host, files host, access key, secret key).
    """
    try:
        load_dotenv(override=True)
    except Exception:
        # Fall back to the .env one path entry up, as the other helpers do.
        load_dotenv(sys.path[1] + "/.env", override=True)
    CLEARML_WEB_HOST = os.getenv("CLEARML_WEB_HOST")
    CLEARML_API_HOST = os.getenv("CLEARML_API_HOST")
    CLEARML_FILES_HOST = os.getenv("CLEARML_FILES_HOST")
    CLEARML_API_ACCESS_KEY = os.getenv("CLEARML_API_ACCESS_KEY")
    # NOTE(review): env key "CLEARML_API_SECRETKEY" lacks the underscore used
    # by every other key here — confirm it matches the .env file's spelling.
    CLEARML_API_SECRET_KEY = os.getenv("CLEARML_API_SECRETKEY")
    return (
        CLEARML_WEB_HOST,
        CLEARML_API_HOST,
        CLEARML_FILES_HOST,
        CLEARML_API_ACCESS_KEY,
        CLEARML_API_SECRET_KEY,
    )
|
project/TrainingPipeline.ipynb
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"# Fine-tuning not required"
|
10 |
+
]
|
11 |
+
}
|
12 |
+
],
|
13 |
+
"metadata": {
|
14 |
+
"kernelspec": {
|
15 |
+
"display_name": ".venv",
|
16 |
+
"language": "python",
|
17 |
+
"name": "python3"
|
18 |
+
},
|
19 |
+
"language_info": {
|
20 |
+
"codemirror_mode": {
|
21 |
+
"name": "ipython",
|
22 |
+
"version": 3
|
23 |
+
},
|
24 |
+
"file_extension": ".py",
|
25 |
+
"mimetype": "text/x-python",
|
26 |
+
"name": "python",
|
27 |
+
"nbconvert_exporter": "python",
|
28 |
+
"pygments_lexer": "ipython3",
|
29 |
+
"version": "3.11.9"
|
30 |
+
}
|
31 |
+
},
|
32 |
+
"nbformat": 4,
|
33 |
+
"nbformat_minor": 2
|
34 |
+
}
|
project/__pycache__/shared.cpython-312.pyc
ADDED
Binary file (4.08 kB). View file
|
|
project/app.py
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Make sure you have run "ollama serve"
|
2 |
+
# This is the same code as ClearML
|
3 |
+
import os
|
4 |
+
import sys
|
5 |
+
from operator import itemgetter
|
6 |
+
|
7 |
+
import gradio as gr
|
8 |
+
from dotenv import load_dotenv
|
9 |
+
from langchain.prompts import PromptTemplate
|
10 |
+
from langchain_community.embeddings import OllamaEmbeddings
|
11 |
+
from langchain_community.llms import Ollama
|
12 |
+
from qdrant_client import QdrantClient
|
13 |
+
from shared import getModel, getEmbeddingsModel, getQdrantClient
|
14 |
+
|
15 |
+
def answer(samplePrompt, useSample, Query):
    """Answer a user query with retrieval-augmented generation over Qdrant.

    Parameters:
        samplePrompt: prompt chosen from the sample dropdown.
        useSample: when truthy, answer ``samplePrompt`` instead of ``Query``.
        Query: free-text prompt typed by the user.

    Returns:
        The LLM's answer as a string.
    """
    query = samplePrompt if useSample else Query

    # Clients: vector store, text embedder and LLM (all configured via .env).
    qClient = getQdrantClient()
    embeddingsModel = getEmbeddingsModel()
    model = getModel()

    # Retrieval pipeline: top-10 chunks with the most similar embeddings.
    def retriever(text, collection):
        return qClient.search(
            collection_name=collection,
            query_vector=embeddingsModel.embed_query(text),
            limit=10,
        )

    # Query expansion (only one additional prompt for simplicity).
    template = """
    Rewrite the prompt. The new prompt must offer a different perspective.
    Do not change the meaning. Output only the rewritten prompt with no introduction.
    Prompt: {prompt}
    """
    prompt = PromptTemplate.from_template(template)
    chain = {"prompt": itemgetter("prompt")} | prompt | model
    queryExpansion = chain.invoke({"prompt": query})

    # Self-querying: decide whether the answer requires code, which selects
    # the Qdrant collection holding GitHub content.
    template = """
    You are an AI assistant. You must determine if the prompt requires code as the answer.
    Output a 1 if it is or a 0 if it is not and nothing else.
    Prompt: {prompt}
    """
    prompt = PromptTemplate.from_template(template)
    chain = {"prompt": itemgetter("prompt")} | prompt | model
    # Fix: strip whitespace/newlines the LLM may append; otherwise the
    # comparison with '1' below silently fails and code questions get
    # routed to the Document collection.
    codingQuestion = chain.invoke({"prompt": query}).strip()

    # Filtered vector search for each of the queries after expansion.
    relatedCollection = 'Github' if codingQuestion == '1' else 'Document'
    results = (
        retriever(query, relatedCollection)
        + retriever(queryExpansion, relatedCollection)
    )

    # Reranking: keep the three best-scoring results with distinct ids.
    # Fix: the original re-seeded maxIndex at 0 on every pass, so with
    # fewer than three distinct positive-scored hits it silently
    # duplicated result 0 in the context.
    rankedIndexes = sorted(
        range(len(results)), key=lambda i: results[i].score, reverse=True
    )
    topIndexes = []
    seenIds = set()
    for i in rankedIndexes:
        if results[i].id in seenIds:
            continue
        seenIds.add(results[i].id)
        topIndexes.append(i)
        if len(topIndexes) == 3:
            break

    topTexts = ''
    for index in topIndexes:
        payload = results[index].payload
        print("Top texts: ", payload['text'])
        print("Link: ", payload['link'])
        topTexts += payload['text']

    # Build the final prompt around the retrieved context.
    if codingQuestion == '1':
        template = """
    Write code for the following question given the related coding document below.

    Document: {document}
    Question: {question}
    """
    else:
        template = """
    Answer the question based on the document below. If you can't answer the question, reply "I don't know"

    Document: {document}
    Question: {question}
    """
    prompt = PromptTemplate.from_template(template)

    # Obtain the answer.
    chain = {"document": itemgetter("document"), "question": itemgetter("question")} | prompt | model
    return chain.invoke({"document": topTexts, "question": query})
|
112 |
+
|
113 |
+
|
114 |
+
# Gradio UI: a sample-prompt dropdown, a checkbox selecting whether to use
# the sample, and a free-text box; the single text output is the answer.
sample_prompts = ["What is ROS?", "Write me code to move a robot"]
ui_inputs = [
    gr.Dropdown(sample_prompts, label="Sample Prompt"),
    "checkbox",
    "text",
]

demo = gr.Interface(fn=answer, inputs=ui_inputs, outputs=["text"])

demo.launch(share=False)
|
project/docker-compose.yml
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Docker Compose stack for the RAG project:
#   ollama - local LLM server (all NVIDIA GPUs reserved), port 11434
#   gradio - the app UI built from ./Dockerfile, port 7860
#   mongo  - raw document store, port 27017
#   qdrant - vector database, ports 6333 (HTTP) / 6334 (gRPC)
services:
  ollama:
    container_name: ollama
    image: ollama/ollama:latest
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every available NVIDIA GPU for model inference.
            - driver: nvidia
              capabilities: ["gpu"]
              count: all
    ports:
      - 11434:11434
    expose:
      - 11434
    volumes:
      # Persist downloaded models between container restarts.
      - ollama:/root/.ollama
  gradio:
    build:
      context: .
      dockerfile: Dockerfile
    ports:
      - 7860:7860
    expose:
      - 7860
    # NOTE(review): only this service joins "net"; ollama/mongo/qdrant stay
    # on the default network -- confirm the app reaches them as intended
    # (the Python code uses host.docker.internal for Ollama when USE_DOCKER
    # is set, which bypasses compose networking).
    networks:
      - net
    environment:
      # Bind on all interfaces so the UI is reachable from the host.
      - GRADIO_SERVER_NAME=0.0.0.0
      - GRADIO_SERVER_PORT=7860
  mongo:
    image: mongo:latest
    ports:
      - 27017:27017
    expose:
      - 27017
    volumes:
      - mongo-data:/data/db
  qdrant:
    image: qdrant/qdrant:latest
    restart: always
    ports:
      - 6333:6333
      - 6334:6334
    expose:
      - 6333
      - 6334
      - 6335
    configs:
      # Inline config (defined below) mounted as the production config file.
      - source: qdrant_config
        target: /qdrant/config/production.yaml
    volumes:
      - qdrant-data:/qdrant/storage




configs:
  qdrant_config:
    content: |
      log_level: INFO

volumes:
  mongo-data:
    driver: local
  qdrant-data:
    driver: local
  ollama:
    driver: local

networks:
  net:
|
project/shared.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# This file contains shared functions used by multiple files
|
2 |
+
import os
|
3 |
+
import sys
|
4 |
+
|
5 |
+
import pymongo
|
6 |
+
from dotenv import load_dotenv
|
7 |
+
from langchain_community.embeddings import OllamaEmbeddings
|
8 |
+
from langchain_openai.chat_models import ChatOpenAI
|
9 |
+
from qdrant_client import QdrantClient
|
10 |
+
from langchain_community.llms import Ollama
|
11 |
+
|
12 |
+
|
13 |
+
# Unused since usage limit reached since years ago...rip
def getOpenAiModel():
    """Return a ChatOpenAI client for gpt-3.5-turbo.

    Reads OPENAI_API_KEY from the environment after loading the .env file.
    """
    model_name = "gpt-3.5-turbo"
    try:
        load_dotenv(override=True)
    except Exception:
        # Notebooks may need an explicit path to the project .env file.
        load_dotenv(sys.path[1] + "/.env", override=True)
    api_key = os.getenv("OPENAI_API_KEY")
    return ChatOpenAI(openai_api_key=api_key, model=model_name)
|
22 |
+
|
23 |
+
|
24 |
+
# Create a mongoDB connection
def getMongoClient():
    """Return a MongoClient for the host named by DATABASE_HOST."""
    try:
        load_dotenv(override=True)
    except Exception:
        # Notebooks may need an explicit path to the project .env file.
        load_dotenv(sys.path[1] + "/.env", override=True)
    host = os.getenv("DATABASE_HOST")
    return pymongo.MongoClient(host)
|
32 |
+
|
33 |
+
|
34 |
+
# Create a qdrant connection
def getQdrantClient():
    """Return a QdrantClient.

    Authenticates with QDRANT_APIKEY when USE_QDRANT_CLOUD is the string
    "True"; otherwise connects without credentials.
    """
    try:
        load_dotenv(override=True)
    except Exception:
        # Notebooks may need an explicit path to the project .env file.
        load_dotenv(sys.path[1] + "/.env", override=True)
    use_cloud = os.getenv("USE_QDRANT_CLOUD") == "True"
    url = os.getenv("QDRANT_CLOUD_URL")
    api_key = os.getenv("QDRANT_APIKEY")
    # NOTE(review): both branches read QDRANT_CLOUD_URL -- presumably the
    # variable holds the local URL when not using the cloud; confirm
    # against .env.example.
    if use_cloud:
        return QdrantClient(url=url, api_key=api_key)
    return QdrantClient(url=url)
|
47 |
+
|
48 |
+
|
49 |
+
# Setup the text embedder
def getEmbeddingsModel(MODEL="llama3.2"):
    """Return an OllamaEmbeddings client for MODEL.

    When USE_DOCKER is "True" the Ollama server is reached through
    host.docker.internal instead of localhost.
    """
    try:
        load_dotenv(override=True)
    except Exception:
        # Notebooks may need an explicit path to the project .env file.
        load_dotenv(sys.path[1] + "/.env", override=True)
    if os.getenv("USE_DOCKER") == "True":
        return OllamaEmbeddings(model=MODEL, base_url="http://host.docker.internal:11434")
    return OllamaEmbeddings(model=MODEL)
|
60 |
+
|
61 |
+
|
62 |
+
# Setup the model
def getModel(MODEL="llama3.2"):
    """Return an Ollama LLM client for MODEL.

    When USE_DOCKER is "True" the Ollama server is reached through
    host.docker.internal instead of localhost.
    """
    try:
        load_dotenv(override=True)
    except Exception:
        # Notebooks may need an explicit path to the project .env file.
        load_dotenv(sys.path[1] + "/.env", override=True)
    if os.getenv("USE_DOCKER") == "True":
        return Ollama(model=MODEL, base_url="http://host.docker.internal:11434")
    return Ollama(model=MODEL)
|
73 |
+
|
74 |
+
|
75 |
+
# Setup clearML
def setupClearML():
    """Load ClearML connection settings from the environment.

    Returns:
        A 5-tuple ``(web_host, api_host, files_host, access_key,
        secret_key)``.  Any entry is ``None`` when the corresponding
        environment variable is unset.
    """
    try:
        load_dotenv(override=True)
    except Exception:
        # Fallback for notebook/script contexts where the CWD lookup fails.
        load_dotenv(sys.path[1] + "/.env", override=True)
    CLEARML_WEB_HOST = os.getenv("CLEARML_WEB_HOST")
    CLEARML_API_HOST = os.getenv("CLEARML_API_HOST")
    CLEARML_FILES_HOST = os.getenv("CLEARML_FILES_HOST")
    CLEARML_API_ACCESS_KEY = os.getenv("CLEARML_API_ACCESS_KEY")
    # Fix: the secret key was read from "CLEARML_API_SECRETKEY" (missing
    # underscore), inconsistent with every other variable here.  Prefer the
    # consistent name but keep the old spelling as a fallback so existing
    # .env files continue to work.
    CLEARML_API_SECRET_KEY = (
        os.getenv("CLEARML_API_SECRET_KEY")
        or os.getenv("CLEARML_API_SECRETKEY")
    )
    return (
        CLEARML_WEB_HOST,
        CLEARML_API_HOST,
        CLEARML_FILES_HOST,
        CLEARML_API_ACCESS_KEY,
        CLEARML_API_SECRET_KEY,
    )
|