{ "cells": [ { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Visiting link: https://www.ros.org/\n", "Visiting link: https://docs.nav2.org/\n", "Visiting link: https://moveit.ai/\n", "Visiting link: https://gazebosim.org/home\n", "Visiting link: https://github.com/ros2/ros2\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Cloning into 'ros2'...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Visiting link: https://github.com/ros-navigation/navigation2\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Cloning into 'navigation2'...\n", "fatal: early EOF\n" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[10], line 35\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 34\u001b[0m os\u001b[38;5;241m.\u001b[39mchdir(local_temp)\n\u001b[0;32m---> 35\u001b[0m \u001b[43msubprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgit\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mclone\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlink\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 36\u001b[0m repo_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(local_temp, os\u001b[38;5;241m.\u001b[39mlistdir(local_temp)[\u001b[38;5;241m0\u001b[39m])\n\u001b[1;32m 37\u001b[0m tree \u001b[38;5;241m=\u001b[39m {}\n", "File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:550\u001b[0m, in \u001b[0;36mrun\u001b[0;34m(input, capture_output, timeout, check, *popenargs, **kwargs)\u001b[0m\n\u001b[1;32m 548\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m Popen(\u001b[38;5;241m*\u001b[39mpopenargs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;28;01mas\u001b[39;00m process:\n\u001b[1;32m 549\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 550\u001b[0m stdout, stderr \u001b[38;5;241m=\u001b[39m \u001b[43mprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcommunicate\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 551\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m TimeoutExpired \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 552\u001b[0m process\u001b[38;5;241m.\u001b[39mkill()\n", "File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:1201\u001b[0m, in \u001b[0;36mPopen.communicate\u001b[0;34m(self, input, timeout)\u001b[0m\n\u001b[1;32m 1199\u001b[0m stderr \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstderr\u001b[38;5;241m.\u001b[39mread()\n\u001b[1;32m 1200\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstderr\u001b[38;5;241m.\u001b[39mclose()\n\u001b[0;32m-> 1201\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1202\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1203\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", "File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:1264\u001b[0m, in \u001b[0;36mPopen.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 1262\u001b[0m endtime \u001b[38;5;241m=\u001b[39m _time() \u001b[38;5;241m+\u001b[39m timeout\n\u001b[1;32m 1263\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1264\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_wait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1265\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m:\n\u001b[1;32m 1266\u001b[0m \u001b[38;5;66;03m# https://bugs.python.org/issue25942\u001b[39;00m\n\u001b[1;32m 1267\u001b[0m \u001b[38;5;66;03m# The first keyboard interrupt waits briefly for the child to\u001b[39;00m\n\u001b[1;32m 1268\u001b[0m \u001b[38;5;66;03m# exit under the common assumption that it also received the ^C\u001b[39;00m\n\u001b[1;32m 1269\u001b[0m \u001b[38;5;66;03m# generated SIGINT and will exit rapidly.\u001b[39;00m\n\u001b[1;32m 1270\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", "File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:2053\u001b[0m, in \u001b[0;36mPopen._wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 2051\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturncode \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 2052\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m \u001b[38;5;66;03m# Another thread waited.\u001b[39;00m\n\u001b[0;32m-> 2053\u001b[0m (pid, sts) \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_try_wait\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2054\u001b[0m \u001b[38;5;66;03m# Check the pid and loop as waitpid has been known to\u001b[39;00m\n\u001b[1;32m 2055\u001b[0m \u001b[38;5;66;03m# return 0 even without WNOHANG in odd situations.\u001b[39;00m\n\u001b[1;32m 2056\u001b[0m \u001b[38;5;66;03m# http://bugs.python.org/issue14396.\u001b[39;00m\n\u001b[1;32m 2057\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m pid \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpid:\n", "File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:2011\u001b[0m, in \u001b[0;36mPopen._try_wait\u001b[0;34m(self, wait_flags)\u001b[0m\n\u001b[1;32m 2009\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"All callers to this function MUST hold self._waitpid_lock.\"\"\"\u001b[39;00m\n\u001b[1;32m 2010\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 2011\u001b[0m (pid, sts) \u001b[38;5;241m=\u001b[39m \u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwaitpid\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpid\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwait_flags\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2012\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mChildProcessError\u001b[39;00m:\n\u001b[1;32m 2013\u001b[0m \u001b[38;5;66;03m# This happens if SIGCLD is set to be ignored or waiting\u001b[39;00m\n\u001b[1;32m 2014\u001b[0m \u001b[38;5;66;03m# for child processes has otherwise been disabled for our\u001b[39;00m\n\u001b[1;32m 2015\u001b[0m \u001b[38;5;66;03m# process. This child is dead, we can't get the status.\u001b[39;00m\n\u001b[1;32m 2016\u001b[0m pid \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpid\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "# See README for more info on how the DataCollectionPipeline works\n", "# The ETL pipeline is part of the DataCollectionPipeline\n", "# Remove the time.sleep(1) line if you are sure you won't get blocked from a webpage for requesting too often\n", "import requests\n", "from bs4 import BeautifulSoup\n", "import time\n", "import os\n", "import shutil\n", "import subprocess\n", "import tempfile\n", "import urllib.parse\n", "from shared import getMongoClient\n", "\n", "# Input into the Data Collection Pipeline is a list of links to domains\n", "links = ['https://www.ros.org/','https://docs.nav2.org/','https://moveit.ai/','https://gazebosim.org/home', 'https://github.com/ros2/ros2', 'https://github.com/ros-navigation/navigation2', 'https://github.com/moveit/moveit2', 'https://github.com/gazebosim/gazebo-classic']\n", "\n", "# Create a mongoDB connection\n", "mongoHost = getMongoClient()\n", "mongoDatabase = mongoHost[\"twin\"]\n", "# ETL pipeline\n", "# Extract data from links and their subdirectories(using crawlers)\n", "codes = []\n", "for link in links:\n", " # Web scraper/crawler for github links\n", " if \"https://github.com\" in link:\n", " # Do not revisit a link already in the database\n", " mongoCollection = mongoDatabase[\"Github\"]\n", " result = mongoCollection.find_one({\"link\": {\"$regex\" : link}})\n", " if result is None:\n", " print(\"Visiting link: \", link)\n", " # Modified GithubCrawler from LLM-Engineer for scraping github\n", " local_temp = tempfile.mkdtemp()\n", " try:\n", " os.chdir(local_temp)\n", " subprocess.run([\"git\", \"clone\", link])\n", " repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])\n", " tree = {}\n", " for root, _, files in os.walk(repo_path):\n", " dir = root.replace(repo_path, \"\").lstrip(\"/\")\n", " if dir.startswith((\".git\", \".toml\", \".lock\", \".png\")):\n", " continue\n", " for file in files:\n", " if file.endswith((\".git\", \".toml\", \".lock\", \".png\")):\n", " continue\n", " file_path = os.path.join(dir, file)\n", " with open(\n", " os.path.join(root, file), \"r\", errors=\"ignore\"\n", " ) as f:\n", " tree[file_path] = f.read().replace(\" \", \"\")\n", " except Exception:\n", " print(f\"Error scrapping {link}\")\n", " finally:\n", " shutil.rmtree(local_temp)\n", " # Correct the link\n", " r = requests.get(link)\n", " soup = BeautifulSoup(r.content, \"html.parser\")\n", " # Find the file path to any of the files in the repository\n", " link_element = soup.find(\"a\", attrs={\"class\": \"Link--primary\"})\n", " path = link_element.get(\"href\")\n", " path = path.rsplit(\"/\", 1)[0]\n", " # Push all the subdirectories to mongo\n", " for subdirectory in tree:\n", " #print(\n", " # f\"Adding subdirectory: https://github.com{path}/{subdirectory}\"\n", " #)\n", " text = tree[subdirectory]\n", " # Transform the data\n", " # Get rid of repeating \\n characters and spaces\n", " text = text.replace(\"\\t\", \" \")\n", " text = text.replace(\"\\n\", \" \")\n", " text_len = len(text)\n", " for i in range(text_len):\n", " while i + 1 < text_len and text[i] == \" \" and text[i + 1] == \" \":\n", " text = text[:i] + text[i + 1 :]\n", " text_len -= 1\n", " codes.append(\n", " {\n", " \"link\": \"https://github.com\"\n", " + path\n", " + \"/\"\n", " + subdirectory,\n", " \"type\": \"Github\",\n", " \"content\": text,\n", " }\n", " )\n", " mongoCollection.insert_many(codes)\n", " codes = []\n", " else:\n", " print(\"Already visited: \", link)\n", " # Web scraper/crawler for other links(Documents)\n", " else:\n", " # Do not revisit a link already in the database\n", " mongoCollection = mongoDatabase[\"Document\"]\n", " result = mongoCollection.find_one({\"link\": link})\n", " if result is None:\n", " print(\"Visiting link: \", link)\n", " try:\n", " # Get all text in the website\n", " r = requests.get(link)\n", " soup = BeautifulSoup(r.content, \"html.parser\")\n", " soup.find_all([\"p\", \"h1\", \"h2\", \"h3\", \"h4\", \"h5\", \"h6\"])\n", " text = soup.get_text()\n", " # Transform the data\n", " # Get rid of repeating \\n characters and spaces\n", " text = text.replace(\"\\t\", \" \")\n", " text = text.replace(\"\\n\", \" \")\n", " text_len = len(text)\n", " for i in range(text_len):\n", " while i + 1 < text_len and text[i] == \" \" and text[i + 1] == \" \":\n", " text = text[:i] + text[i + 1 :]\n", " text_len -= 1\n", " if \"404\" not in text:\n", " mongoCollection.insert_one({\"link\": link, \"type\": \"Document\", \"content\": text})\n", " else:\n", " print(\"Page not found: \", link)\n", " # Also crawl through all subdirectorys in the link(related links)\n", " soup = BeautifulSoup(r.content, \"html.parser\")\n", " subdirectories = [a.get(\"href\") for a in soup.find_all(\"a\")]\n", " for subdirectory in subdirectories:\n", " newLink = urllib.parse.urljoin(link, subdirectory)\n", " if (\n", " subdirectory is not None and\n", " 'http' not in subdirectory and\n", " mongoCollection.find_one({\"link\": newLink}) is None\n", " ):\n", " #print(\"Adding subdirectory: \", link + subdirectory)\n", " links.append(newLink)\n", " except:\n", " print(\"Could not crawl link\", link)\n", " else:\n", " print(\"Already visited: \", link)\n", " # Avoid spamming sites\n", " time.sleep(0.1)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.7" } }, "nbformat": 4, "nbformat_minor": 2 }