File size: 16,207 Bytes
2af0eb7 4779f10 2af0eb7 4779f10 2af0eb7 4779f10 2af0eb7 4779f10 2af0eb7 4779f10 2af0eb7 4779f10 2af0eb7 4779f10 2af0eb7 4779f10 2af0eb7 4779f10 2af0eb7 4779f10 2af0eb7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Visiting link: https://www.ros.org/\n",
"Visiting link: https://docs.nav2.org/\n",
"Visiting link: https://moveit.ai/\n",
"Visiting link: https://gazebosim.org/home\n",
"Visiting link: https://github.com/ros2/ros2\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Cloning into 'ros2'...\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Visiting link: https://github.com/ros-navigation/navigation2\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Cloning into 'navigation2'...\n",
"fatal: early EOF\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[10], line 35\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 34\u001b[0m os\u001b[38;5;241m.\u001b[39mchdir(local_temp)\n\u001b[0;32m---> 35\u001b[0m \u001b[43msubprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgit\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mclone\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlink\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 36\u001b[0m repo_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(local_temp, os\u001b[38;5;241m.\u001b[39mlistdir(local_temp)[\u001b[38;5;241m0\u001b[39m])\n\u001b[1;32m 37\u001b[0m tree \u001b[38;5;241m=\u001b[39m {}\n",
"File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:550\u001b[0m, in \u001b[0;36mrun\u001b[0;34m(input, capture_output, timeout, check, *popenargs, **kwargs)\u001b[0m\n\u001b[1;32m 548\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m Popen(\u001b[38;5;241m*\u001b[39mpopenargs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;28;01mas\u001b[39;00m process:\n\u001b[1;32m 549\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 550\u001b[0m stdout, stderr \u001b[38;5;241m=\u001b[39m \u001b[43mprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcommunicate\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 551\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m TimeoutExpired \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 552\u001b[0m process\u001b[38;5;241m.\u001b[39mkill()\n",
"File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:1201\u001b[0m, in \u001b[0;36mPopen.communicate\u001b[0;34m(self, input, timeout)\u001b[0m\n\u001b[1;32m 1199\u001b[0m stderr \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstderr\u001b[38;5;241m.\u001b[39mread()\n\u001b[1;32m 1200\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstderr\u001b[38;5;241m.\u001b[39mclose()\n\u001b[0;32m-> 1201\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1202\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1203\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
"File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:1264\u001b[0m, in \u001b[0;36mPopen.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 1262\u001b[0m endtime \u001b[38;5;241m=\u001b[39m _time() \u001b[38;5;241m+\u001b[39m timeout\n\u001b[1;32m 1263\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1264\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_wait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1265\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m:\n\u001b[1;32m 1266\u001b[0m \u001b[38;5;66;03m# https://bugs.python.org/issue25942\u001b[39;00m\n\u001b[1;32m 1267\u001b[0m \u001b[38;5;66;03m# The first keyboard interrupt waits briefly for the child to\u001b[39;00m\n\u001b[1;32m 1268\u001b[0m \u001b[38;5;66;03m# exit under the common assumption that it also received the ^C\u001b[39;00m\n\u001b[1;32m 1269\u001b[0m \u001b[38;5;66;03m# generated SIGINT and will exit rapidly.\u001b[39;00m\n\u001b[1;32m 1270\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
"File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:2053\u001b[0m, in \u001b[0;36mPopen._wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 2051\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturncode \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 2052\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m \u001b[38;5;66;03m# Another thread waited.\u001b[39;00m\n\u001b[0;32m-> 2053\u001b[0m (pid, sts) \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_try_wait\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2054\u001b[0m \u001b[38;5;66;03m# Check the pid and loop as waitpid has been known to\u001b[39;00m\n\u001b[1;32m 2055\u001b[0m \u001b[38;5;66;03m# return 0 even without WNOHANG in odd situations.\u001b[39;00m\n\u001b[1;32m 2056\u001b[0m \u001b[38;5;66;03m# http://bugs.python.org/issue14396.\u001b[39;00m\n\u001b[1;32m 2057\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m pid \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpid:\n",
"File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:2011\u001b[0m, in \u001b[0;36mPopen._try_wait\u001b[0;34m(self, wait_flags)\u001b[0m\n\u001b[1;32m 2009\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"All callers to this function MUST hold self._waitpid_lock.\"\"\"\u001b[39;00m\n\u001b[1;32m 2010\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 2011\u001b[0m (pid, sts) \u001b[38;5;241m=\u001b[39m \u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwaitpid\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpid\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwait_flags\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2012\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mChildProcessError\u001b[39;00m:\n\u001b[1;32m 2013\u001b[0m \u001b[38;5;66;03m# This happens if SIGCLD is set to be ignored or waiting\u001b[39;00m\n\u001b[1;32m 2014\u001b[0m \u001b[38;5;66;03m# for child processes has otherwise been disabled for our\u001b[39;00m\n\u001b[1;32m 2015\u001b[0m \u001b[38;5;66;03m# process. This child is dead, we can't get the status.\u001b[39;00m\n\u001b[1;32m 2016\u001b[0m pid \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpid\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"# See README for more info on how the DataCollectionPipeline works\n",
"# The ETL pipeline is part of the DataCollectionPipeline\n",
"# Remove the time.sleep(1) line if you are sure you won't get blocked from a webpage for requesting too often\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import time\n",
"import os\n",
"import shutil\n",
"import subprocess\n",
"import tempfile\n",
"import urllib.parse\n",
"from shared import getMongoClient\n",
"\n",
"# Input into the Data Collection Pipeline is a list of links to domains\n",
"links = ['https://www.ros.org/','https://docs.nav2.org/','https://moveit.ai/','https://gazebosim.org/home', 'https://github.com/ros2/ros2', 'https://github.com/ros-navigation/navigation2', 'https://github.com/moveit/moveit2', 'https://github.com/gazebosim/gazebo-classic']\n",
"\n",
"# Create a mongoDB connection\n",
"mongoHost = getMongoClient()\n",
"mongoDatabase = mongoHost[\"twin\"]\n",
"# ETL pipeline\n",
"# Extract data from links and their subdirectories(using crawlers)\n",
"codes = []\n",
"for link in links:\n",
" # Web scraper/crawler for github links\n",
" if \"https://github.com\" in link:\n",
" # Do not revisit a link already in the database\n",
" mongoCollection = mongoDatabase[\"Github\"]\n",
" result = mongoCollection.find_one({\"link\": {\"$regex\" : link}})\n",
" if result is None:\n",
" print(\"Visiting link: \", link)\n",
" # Modified GithubCrawler from LLM-Engineer for scraping github\n",
" local_temp = tempfile.mkdtemp()\n",
" try:\n",
" os.chdir(local_temp)\n",
" subprocess.run([\"git\", \"clone\", link])\n",
" repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])\n",
" tree = {}\n",
" for root, _, files in os.walk(repo_path):\n",
" dir = root.replace(repo_path, \"\").lstrip(\"/\")\n",
" if dir.startswith((\".git\", \".toml\", \".lock\", \".png\")):\n",
" continue\n",
" for file in files:\n",
" if file.endswith((\".git\", \".toml\", \".lock\", \".png\")):\n",
" continue\n",
" file_path = os.path.join(dir, file)\n",
" with open(\n",
" os.path.join(root, file), \"r\", errors=\"ignore\"\n",
" ) as f:\n",
" tree[file_path] = f.read().replace(\" \", \"\")\n",
" except Exception:\n",
" print(f\"Error scrapping {link}\")\n",
" finally:\n",
" shutil.rmtree(local_temp)\n",
" # Correct the link\n",
" r = requests.get(link)\n",
" soup = BeautifulSoup(r.content, \"html.parser\")\n",
" # Find the file path to any of the files in the repository\n",
" link_element = soup.find(\"a\", attrs={\"class\": \"Link--primary\"})\n",
" path = link_element.get(\"href\")\n",
" path = path.rsplit(\"/\", 1)[0]\n",
" # Push all the subdirectories to mongo\n",
" for subdirectory in tree:\n",
" #print(\n",
" # f\"Adding subdirectory: https://github.com{path}/{subdirectory}\"\n",
" #)\n",
" text = tree[subdirectory]\n",
" # Transform the data\n",
" # Get rid of repeating \\n characters and spaces\n",
" text = text.replace(\"\\t\", \" \")\n",
" text = text.replace(\"\\n\", \" \")\n",
" text_len = len(text)\n",
" for i in range(text_len):\n",
" while i + 1 < text_len and text[i] == \" \" and text[i + 1] == \" \":\n",
" text = text[:i] + text[i + 1 :]\n",
" text_len -= 1\n",
" codes.append(\n",
" {\n",
" \"link\": \"https://github.com\"\n",
" + path\n",
" + \"/\"\n",
" + subdirectory,\n",
" \"type\": \"Github\",\n",
" \"content\": text,\n",
" }\n",
" )\n",
" mongoCollection.insert_many(codes)\n",
" codes = []\n",
" else:\n",
" print(\"Already visited: \", link)\n",
" # Web scraper/crawler for other links(Documents)\n",
" else:\n",
" # Do not revisit a link already in the database\n",
" mongoCollection = mongoDatabase[\"Document\"]\n",
" result = mongoCollection.find_one({\"link\": link})\n",
" if result is None:\n",
" print(\"Visiting link: \", link)\n",
" try:\n",
" # Get all text in the website\n",
" r = requests.get(link)\n",
" soup = BeautifulSoup(r.content, \"html.parser\")\n",
" soup.find_all([\"p\", \"h1\", \"h2\", \"h3\", \"h4\", \"h5\", \"h6\"])\n",
" text = soup.get_text()\n",
" # Transform the data\n",
" # Get rid of repeating \\n characters and spaces\n",
" text = text.replace(\"\\t\", \" \")\n",
" text = text.replace(\"\\n\", \" \")\n",
" text_len = len(text)\n",
" for i in range(text_len):\n",
" while i + 1 < text_len and text[i] == \" \" and text[i + 1] == \" \":\n",
" text = text[:i] + text[i + 1 :]\n",
" text_len -= 1\n",
" if \"404\" not in text:\n",
" mongoCollection.insert_one({\"link\": link, \"type\": \"Document\", \"content\": text})\n",
" else:\n",
" print(\"Page not found: \", link)\n",
" # Also crawl through all subdirectorys in the link(related links)\n",
" soup = BeautifulSoup(r.content, \"html.parser\")\n",
" subdirectories = [a.get(\"href\") for a in soup.find_all(\"a\")]\n",
" for subdirectory in subdirectories:\n",
" newLink = urllib.parse.urljoin(link, subdirectory)\n",
" if (\n",
" subdirectory is not None and\n",
" 'http' not in subdirectory and\n",
" mongoCollection.find_one({\"link\": newLink}) is None\n",
" ):\n",
" #print(\"Adding subdirectory: \", link + subdirectory)\n",
" links.append(newLink)\n",
" except:\n",
" print(\"Could not crawl link\", link)\n",
" else:\n",
" print(\"Already visited: \", link)\n",
" # Avoid spamming sites\n",
" time.sleep(0.1)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|