{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Visiting link:  https://www.ros.org/\n",
      "Visiting link:  https://docs.nav2.org/\n",
      "Visiting link:  https://moveit.ai/\n",
      "Visiting link:  https://gazebosim.org/home\n",
      "Visiting link:  https://github.com/ros2/ros2\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Cloning into 'ros2'...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Visiting link:  https://github.com/ros-navigation/navigation2\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Cloning into 'navigation2'...\n",
      "fatal: early EOF\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[10], line 35\u001b[0m\n\u001b[1;32m     33\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m     34\u001b[0m     os\u001b[38;5;241m.\u001b[39mchdir(local_temp)\n\u001b[0;32m---> 35\u001b[0m     \u001b[43msubprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgit\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mclone\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlink\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     36\u001b[0m     repo_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(local_temp, os\u001b[38;5;241m.\u001b[39mlistdir(local_temp)[\u001b[38;5;241m0\u001b[39m])\n\u001b[1;32m     37\u001b[0m     tree \u001b[38;5;241m=\u001b[39m {}\n",
      "File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:550\u001b[0m, in \u001b[0;36mrun\u001b[0;34m(input, capture_output, timeout, check, *popenargs, **kwargs)\u001b[0m\n\u001b[1;32m    548\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m Popen(\u001b[38;5;241m*\u001b[39mpopenargs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;28;01mas\u001b[39;00m process:\n\u001b[1;32m    549\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 550\u001b[0m         stdout, stderr \u001b[38;5;241m=\u001b[39m \u001b[43mprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcommunicate\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    551\u001b[0m     \u001b[38;5;28;01mexcept\u001b[39;00m TimeoutExpired \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m    552\u001b[0m         process\u001b[38;5;241m.\u001b[39mkill()\n",
      "File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:1201\u001b[0m, in \u001b[0;36mPopen.communicate\u001b[0;34m(self, input, timeout)\u001b[0m\n\u001b[1;32m   1199\u001b[0m         stderr \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstderr\u001b[38;5;241m.\u001b[39mread()\n\u001b[1;32m   1200\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstderr\u001b[38;5;241m.\u001b[39mclose()\n\u001b[0;32m-> 1201\u001b[0m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1202\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m   1203\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
      "File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:1264\u001b[0m, in \u001b[0;36mPopen.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m   1262\u001b[0m     endtime \u001b[38;5;241m=\u001b[39m _time() \u001b[38;5;241m+\u001b[39m timeout\n\u001b[1;32m   1263\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1264\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_wait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1265\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m:\n\u001b[1;32m   1266\u001b[0m     \u001b[38;5;66;03m# https://bugs.python.org/issue25942\u001b[39;00m\n\u001b[1;32m   1267\u001b[0m     \u001b[38;5;66;03m# The first keyboard interrupt waits briefly for the child to\u001b[39;00m\n\u001b[1;32m   1268\u001b[0m     \u001b[38;5;66;03m# exit under the common assumption that it also received the ^C\u001b[39;00m\n\u001b[1;32m   1269\u001b[0m     \u001b[38;5;66;03m# generated SIGINT and will exit rapidly.\u001b[39;00m\n\u001b[1;32m   1270\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
      "File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:2053\u001b[0m, in \u001b[0;36mPopen._wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m   2051\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturncode \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m   2052\u001b[0m     \u001b[38;5;28;01mbreak\u001b[39;00m  \u001b[38;5;66;03m# Another thread waited.\u001b[39;00m\n\u001b[0;32m-> 2053\u001b[0m (pid, sts) \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_try_wait\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2054\u001b[0m \u001b[38;5;66;03m# Check the pid and loop as waitpid has been known to\u001b[39;00m\n\u001b[1;32m   2055\u001b[0m \u001b[38;5;66;03m# return 0 even without WNOHANG in odd situations.\u001b[39;00m\n\u001b[1;32m   2056\u001b[0m \u001b[38;5;66;03m# http://bugs.python.org/issue14396.\u001b[39;00m\n\u001b[1;32m   2057\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m pid \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpid:\n",
      "File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:2011\u001b[0m, in \u001b[0;36mPopen._try_wait\u001b[0;34m(self, wait_flags)\u001b[0m\n\u001b[1;32m   2009\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"All callers to this function MUST hold self._waitpid_lock.\"\"\"\u001b[39;00m\n\u001b[1;32m   2010\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 2011\u001b[0m     (pid, sts) \u001b[38;5;241m=\u001b[39m \u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwaitpid\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpid\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwait_flags\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2012\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mChildProcessError\u001b[39;00m:\n\u001b[1;32m   2013\u001b[0m     \u001b[38;5;66;03m# This happens if SIGCLD is set to be ignored or waiting\u001b[39;00m\n\u001b[1;32m   2014\u001b[0m     \u001b[38;5;66;03m# for child processes has otherwise been disabled for our\u001b[39;00m\n\u001b[1;32m   2015\u001b[0m     \u001b[38;5;66;03m# process.  This child is dead, we can't get the status.\u001b[39;00m\n\u001b[1;32m   2016\u001b[0m     pid \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpid\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "# See README for more info on how the DataCollectionPipeline works\n",
    "# The ETL pipeline is part of the DataCollectionPipeline\n",
    "# Remove the time.sleep(1) line if you are sure you won't get blocked from a webpage for requesting too often\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import time\n",
    "import os\n",
    "import shutil\n",
    "import subprocess\n",
    "import tempfile\n",
    "import urllib.parse\n",
    "from shared import getMongoClient\n",
    "\n",
    "# Input into the Data Collection Pipeline is a list of links to domains\n",
    "links = ['https://www.ros.org/','https://docs.nav2.org/','https://moveit.ai/','https://gazebosim.org/home', 'https://github.com/ros2/ros2', 'https://github.com/ros-navigation/navigation2', 'https://github.com/moveit/moveit2', 'https://github.com/gazebosim/gazebo-classic']\n",
    "\n",
    "# Create a mongoDB connection\n",
    "mongoHost =  getMongoClient()\n",
    "mongoDatabase =  mongoHost[\"twin\"]\n",
    "# ETL pipeline\n",
    "# Extract data from links and their subdirectories(using crawlers)\n",
    "codes = []\n",
    "for link in links:\n",
    "    # Web scraper/crawler for github links\n",
    "    if \"https://github.com\" in link:\n",
    "        # Do not revisit a link already in the database\n",
    "        mongoCollection = mongoDatabase[\"Github\"]\n",
    "        result = mongoCollection.find_one({\"link\": {\"$regex\" : link}})\n",
    "        if result is None:\n",
    "            print(\"Visiting link: \", link)\n",
    "            # Modified GithubCrawler from LLM-Engineer for scraping github\n",
    "            local_temp = tempfile.mkdtemp()\n",
    "            try:\n",
    "                os.chdir(local_temp)\n",
    "                subprocess.run([\"git\", \"clone\", link])\n",
    "                repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])\n",
    "                tree = {}\n",
    "                for root, _, files in os.walk(repo_path):\n",
    "                    dir = root.replace(repo_path, \"\").lstrip(\"/\")\n",
    "                    if dir.startswith((\".git\", \".toml\", \".lock\", \".png\")):\n",
    "                        continue\n",
    "                    for file in files:\n",
    "                        if file.endswith((\".git\", \".toml\", \".lock\", \".png\")):\n",
    "                            continue\n",
    "                        file_path = os.path.join(dir, file)\n",
    "                        with open(\n",
    "                            os.path.join(root, file), \"r\", errors=\"ignore\"\n",
    "                        ) as f:\n",
    "                            tree[file_path] = f.read().replace(\" \", \"\")\n",
    "            except Exception:\n",
    "                print(f\"Error scrapping {link}\")\n",
    "            finally:\n",
    "                shutil.rmtree(local_temp)\n",
    "                # Correct the link\n",
    "                r = requests.get(link)\n",
    "                soup = BeautifulSoup(r.content, \"html.parser\")\n",
    "                # Find the file path to any of the files in the repository\n",
    "                link_element = soup.find(\"a\", attrs={\"class\": \"Link--primary\"})\n",
    "                path = link_element.get(\"href\")\n",
    "                path = path.rsplit(\"/\", 1)[0]\n",
    "                # Push all the subdirectories to mongo\n",
    "                for subdirectory in tree:\n",
    "                    #print(\n",
    "                    #    f\"Adding subdirectory: https://github.com{path}/{subdirectory}\"\n",
    "                    #)\n",
    "                    text = tree[subdirectory]\n",
    "                    # Transform the data\n",
    "                    # Get rid of repeating \\n characters and spaces\n",
    "                    text = text.replace(\"\\t\", \" \")\n",
    "                    text = text.replace(\"\\n\", \" \")\n",
    "                    text_len = len(text)\n",
    "                    for i in range(text_len):\n",
    "                        while i + 1 < text_len and text[i] == \" \" and text[i + 1] == \" \":\n",
    "                            text = text[:i] + text[i + 1 :]\n",
    "                            text_len -= 1\n",
    "                    codes.append(\n",
    "                        {\n",
    "                            \"link\": \"https://github.com\"\n",
    "                            + path\n",
    "                            + \"/\"\n",
    "                            + subdirectory,\n",
    "                            \"type\": \"Github\",\n",
    "                            \"content\": text,\n",
    "                        }\n",
    "                    )\n",
    "                mongoCollection.insert_many(codes)\n",
    "                codes = []\n",
    "        else:\n",
    "            print(\"Already visited: \", link)\n",
    "    # Web scraper/crawler for other links(Documents)\n",
    "    else:\n",
    "        # Do not revisit a link already in the database\n",
    "        mongoCollection = mongoDatabase[\"Document\"]\n",
    "        result = mongoCollection.find_one({\"link\": link})\n",
    "        if result is None:\n",
    "            print(\"Visiting link: \", link)\n",
    "            try:\n",
    "                # Get all text in the website\n",
    "                r = requests.get(link)\n",
    "                soup = BeautifulSoup(r.content, \"html.parser\")\n",
    "                soup.find_all([\"p\", \"h1\", \"h2\", \"h3\", \"h4\", \"h5\", \"h6\"])\n",
    "                text = soup.get_text()\n",
    "                # Transform the data\n",
    "                # Get rid of repeating \\n characters and spaces\n",
    "                text = text.replace(\"\\t\", \" \")\n",
    "                text = text.replace(\"\\n\", \" \")\n",
    "                text_len = len(text)\n",
    "                for i in range(text_len):\n",
    "                    while i + 1 < text_len and text[i] == \" \" and text[i + 1] == \" \":\n",
    "                        text = text[:i] + text[i + 1 :]\n",
    "                        text_len -= 1\n",
    "                if \"404\" not in text:\n",
    "                    mongoCollection.insert_one({\"link\": link, \"type\": \"Document\", \"content\": text})\n",
    "                else:\n",
    "                    print(\"Page not found: \", link)\n",
    "                # Also crawl through all subdirectorys in the link(related links)\n",
    "                soup = BeautifulSoup(r.content, \"html.parser\")\n",
    "                subdirectories = [a.get(\"href\") for a in soup.find_all(\"a\")]\n",
    "                for subdirectory in subdirectories:\n",
    "                    newLink = urllib.parse.urljoin(link, subdirectory)\n",
    "                    if (\n",
    "                        subdirectory is not None and\n",
    "                        'http' not in subdirectory and\n",
    "                        mongoCollection.find_one({\"link\": newLink}) is None\n",
    "                    ):\n",
    "                        #print(\"Adding subdirectory: \", link + subdirectory)\n",
    "                        links.append(newLink)\n",
    "            except:\n",
    "                print(\"Could not crawl link\", link)\n",
    "        else:\n",
    "            print(\"Already visited: \", link)\n",
    "    # Avoid spamming sites\n",
    "    time.sleep(0.1)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}